From f29b390d390f20a1298313ea25959e49a7a94665 Mon Sep 17 00:00:00 2001 From: Raul Puri Date: Wed, 27 Mar 2019 13:55:24 -0700 Subject: [PATCH] initial commit --- .DS_Store | Bin 0 -> 6148 bytes CODEOWNERS | 1 + Dockerfile | 24 + LICENSE | 86 +- MANIFEST.in | 2 + README.md | 597 +++++++- arguments.py | 283 ---- data_utils/corpora.py | 37 - examples/ds_config.json | 24 + examples/ds_pretrain_gpt2.sh | 133 ++ examples/ds_pretrain_gpt2_pipe.sh | 140 ++ examples/ds_zero_stage_2_config.json | 32 + examples/evaluate_zeroshot_gpt2.sh | 38 + examples/finetune_mnli_distributed.sh | 44 + examples/finetune_race_distributed.sh | 47 + examples/generate_text.sh | 25 + examples/merge_mp_bert.sh | 18 + examples/pretrain_bert.sh | 35 + examples/pretrain_bert_distributed.sh | 44 + examples/pretrain_gpt2.sh | 43 + examples/pretrain_gpt2_distributed.sh | 52 + images/Makefile | 12 + images/cases.png | Bin 0 -> 11807 bytes images/scaling-dp.png | Bin 0 -> 13419 bytes images/scaling-mp.png | Bin 0 -> 22813 bytes images/tables.tex | 40 + kubernetes/deploy_k8s.sh | 90 ++ kubernetes/get_wandb_api_key.py | 23 + kubernetes/id_rsa | 39 + kubernetes/id_rsa.pub | 1 + kubernetes/k8s_spec.yml | 67 + kubernetes/k8s_spec_temp.yml | 66 + kubernetes/kill_k8s.sh | 9 + kubernetes/post_start_script.sh | 1 + learning_rates.py | 74 - megatron/__init__.py | 41 + megatron/arguments.py | 538 +++++++ megatron/checkpointing.py | 340 +++++ megatron/data/Makefile | 9 + megatron/data/__init__.py | 1 + megatron/data/bert_dataset.py | 232 +++ megatron/data/dataset_utils.py | 503 +++++++ megatron/data/gpt2_dataset.py | 317 ++++ megatron/data/helpers.cpp | 643 ++++++++ megatron/data/ict_dataset.py | 140 ++ megatron/data/indexed_dataset.py | 570 +++++++ megatron/data/realm_dataset_utils.py | 201 +++ megatron/data/realm_index.py | 216 +++ megatron/data/samplers.py | 148 ++ megatron/data/test/test_indexed_dataset.py | 125 ++ megatron/data/test/test_preprocess_data.sh | 10 + .../deprecated_data_utils}/__init__.py | 64 +- 
.../deprecated_data_utils/configure_data.py | 69 +- megatron/deprecated_data_utils/corpora.py | 61 + .../deprecated_data_utils}/datasets.py | 293 +++- .../deprecated_data_utils}/file_utils.py | 2 +- .../deprecated_data_utils}/lazy_loader.py | 47 +- .../deprecated_data_utils}/samplers.py | 66 +- .../scripts/presplit_sentences_json.py | 27 + .../scripts/split_gpt2_json.py | 141 ++ .../scripts/split_json.py | 126 ++ .../deprecated_data_utils}/tf_dl.py | 74 +- .../deprecated_data_utils}/tokenization.py | 274 +++- .../tokenization_gpt2.py | 319 ++++ .../deprecated_data_utils}/wordpiece.py | 17 +- {fp16 => megatron/fp16}/__init__.py | 2 +- {fp16 => megatron/fp16}/fp16.py | 208 +-- {fp16 => megatron/fp16}/fp16util.py | 36 +- {fp16 => megatron/fp16}/loss_scaler.py | 71 +- megatron/fused_kernels/__init__.py | 100 ++ .../fused_kernels/scaled_masked_softmax.cpp | 74 + .../fused_kernels/scaled_masked_softmax.h | 452 ++++++ .../scaled_masked_softmax_cuda.cu | 102 ++ .../scaled_upper_triang_masked_softmax.cpp | 69 + .../scaled_upper_triang_masked_softmax.h | 439 ++++++ ...scaled_upper_triang_masked_softmax_cuda.cu | 89 ++ megatron/global_vars.py | 233 +++ megatron/indexer.py | 91 ++ megatron/initialize.py | 226 +++ megatron/learning_rates.py | 123 ++ megatron/memory.py | 145 ++ {optim => megatron/model}/__init__.py | 19 +- megatron/model/bert_model.py | 200 +++ megatron/model/classification.py | 98 ++ {model => megatron/model}/distributed.py | 28 +- megatron/model/fused_bias_gelu.py | 60 + megatron/model/fused_softmax.py | 127 ++ megatron/model/gpt2_model.py | 228 +++ megatron/model/language_model.py | 384 +++++ megatron/model/multiple_choice.py | 110 ++ megatron/model/realm_model.py | 204 +++ megatron/model/transformer.py | 631 ++++++++ megatron/model/utils.py | 83 ++ megatron/module.py | 31 + megatron/mpu/__init__.py | 58 + megatron/mpu/cross_entropy.py | 110 ++ megatron/mpu/data.py | 116 ++ megatron/mpu/grads.py | 127 ++ megatron/mpu/initialize.py | 252 ++++ 
megatron/mpu/layers.py | 363 +++++ megatron/mpu/mappings.py | 157 ++ megatron/mpu/random.py | 319 ++++ megatron/mpu/tests/__init__.py | 0 megatron/mpu/tests/commons.py | 83 ++ megatron/mpu/tests/test_cross_entropy.py | 108 ++ megatron/mpu/tests/test_data.py | 88 ++ megatron/mpu/tests/test_initialize.py | 95 ++ megatron/mpu/tests/test_layers.py | 530 +++++++ megatron/mpu/tests/test_random.py | 204 +++ megatron/mpu/utils.py | 70 + megatron/package_info.py | 30 + megatron/text_generation_utils.py | 412 ++++++ {model => megatron/tokenizer}/__init__.py | 6 +- megatron/tokenizer/bert_tokenization.py | 402 +++++ megatron/tokenizer/gpt2_tokenization.py | 321 ++++ megatron/tokenizer/tokenizer.py | 220 +++ megatron/training.py | 695 +++++++++ megatron/utils.py | 177 +++ model/model.py | 88 -- model/modeling.py | 1314 ----------------- optim/adam.py | 124 -- pretrain_bert.py | 511 +------ pretrain_gpt2.py | 156 ++ pretrain_ict.py | 138 ++ requirements.txt | 10 +- scripts/pretrain_bert.sh | 37 - scripts/pretrain_bert_distributed.sh | 44 - scripts/pretrain_bert_sentencepiece.sh | 37 - .../pretrain_bert_tfrecords_distributed.sh | 43 - setup.py | 91 ++ tasks/data_utils.py | 118 ++ tasks/ensemble_classifier.py | 149 ++ tasks/eval_utils.py | 127 ++ tasks/finetune_utils.py | 259 ++++ tasks/glue/data.py | 69 + tasks/glue/finetune.py | 90 ++ tasks/glue/mnli.py | 84 ++ tasks/glue/qqp.py | 101 ++ tasks/main.py | 69 + tasks/race/data.py | 131 ++ tasks/race/finetune.py | 63 + tasks/zeroshot_gpt2/datasets.py | 161 ++ tasks/zeroshot_gpt2/detokenizer.py | 80 + tasks/zeroshot_gpt2/evaluate.py | 195 +++ tools/create_doc_index.py | 30 + tools/generate_samples_gpt2.py | 100 ++ tools/linter.py | 36 + tools/merge_mp_partitions.py | 286 ++++ tools/openwebtext/README.md | 46 + tools/openwebtext/blacklist_urls.py | 312 ++++ tools/openwebtext/cleanup_dataset.py | 115 ++ tools/openwebtext/find_duplicates.py | 100 ++ tools/openwebtext/group_duplicates_url.py | 90 ++ tools/openwebtext/merge_jsons.py | 55 
+ tools/openwebtext/remove_group_duplicates.py | 69 + tools/preprocess_data.py | 200 +++ utils.py | 180 --- 157 files changed, 20112 insertions(+), 3143 deletions(-) create mode 100644 .DS_Store create mode 100644 CODEOWNERS create mode 100644 Dockerfile create mode 100644 MANIFEST.in delete mode 100644 arguments.py delete mode 100755 data_utils/corpora.py create mode 100755 examples/ds_config.json create mode 100755 examples/ds_pretrain_gpt2.sh create mode 100755 examples/ds_pretrain_gpt2_pipe.sh create mode 100755 examples/ds_zero_stage_2_config.json create mode 100755 examples/evaluate_zeroshot_gpt2.sh create mode 100755 examples/finetune_mnli_distributed.sh create mode 100755 examples/finetune_race_distributed.sh create mode 100755 examples/generate_text.sh create mode 100755 examples/merge_mp_bert.sh create mode 100755 examples/pretrain_bert.sh create mode 100755 examples/pretrain_bert_distributed.sh create mode 100755 examples/pretrain_gpt2.sh create mode 100755 examples/pretrain_gpt2_distributed.sh create mode 100644 images/Makefile create mode 100644 images/cases.png create mode 100644 images/scaling-dp.png create mode 100644 images/scaling-mp.png create mode 100644 images/tables.tex create mode 100755 kubernetes/deploy_k8s.sh create mode 100644 kubernetes/get_wandb_api_key.py create mode 100644 kubernetes/id_rsa create mode 100644 kubernetes/id_rsa.pub create mode 100644 kubernetes/k8s_spec.yml create mode 100644 kubernetes/k8s_spec_temp.yml create mode 100755 kubernetes/kill_k8s.sh create mode 100644 kubernetes/post_start_script.sh delete mode 100644 learning_rates.py create mode 100644 megatron/__init__.py create mode 100644 megatron/arguments.py create mode 100644 megatron/checkpointing.py create mode 100644 megatron/data/Makefile create mode 100644 megatron/data/__init__.py create mode 100644 megatron/data/bert_dataset.py create mode 100644 megatron/data/dataset_utils.py create mode 100644 megatron/data/gpt2_dataset.py create mode 100644 
megatron/data/helpers.cpp create mode 100644 megatron/data/ict_dataset.py create mode 100644 megatron/data/indexed_dataset.py create mode 100644 megatron/data/realm_dataset_utils.py create mode 100644 megatron/data/realm_index.py create mode 100644 megatron/data/samplers.py create mode 100644 megatron/data/test/test_indexed_dataset.py create mode 100755 megatron/data/test/test_preprocess_data.sh rename {data_utils => megatron/deprecated_data_utils}/__init__.py (60%) rename configure_data.py => megatron/deprecated_data_utils/configure_data.py (79%) create mode 100755 megatron/deprecated_data_utils/corpora.py rename {data_utils => megatron/deprecated_data_utils}/datasets.py (69%) mode change 100644 => 100755 rename {data_utils => megatron/deprecated_data_utils}/file_utils.py (99%) rename {data_utils => megatron/deprecated_data_utils}/lazy_loader.py (84%) rename {data_utils => megatron/deprecated_data_utils}/samplers.py (56%) create mode 100644 megatron/deprecated_data_utils/scripts/presplit_sentences_json.py create mode 100644 megatron/deprecated_data_utils/scripts/split_gpt2_json.py create mode 100644 megatron/deprecated_data_utils/scripts/split_json.py rename {data_utils => megatron/deprecated_data_utils}/tf_dl.py (59%) rename {data_utils => megatron/deprecated_data_utils}/tokenization.py (74%) create mode 100644 megatron/deprecated_data_utils/tokenization_gpt2.py rename {data_utils => megatron/deprecated_data_utils}/wordpiece.py (96%) rename {fp16 => megatron/fp16}/__init__.py (93%) rename {fp16 => megatron/fp16}/fp16.py (88%) rename {fp16 => megatron/fp16}/fp16util.py (88%) rename {fp16 => megatron/fp16}/loss_scaler.py (81%) create mode 100644 megatron/fused_kernels/__init__.py create mode 100644 megatron/fused_kernels/scaled_masked_softmax.cpp create mode 100644 megatron/fused_kernels/scaled_masked_softmax.h create mode 100644 megatron/fused_kernels/scaled_masked_softmax_cuda.cu create mode 100644 megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp 
create mode 100644 megatron/fused_kernels/scaled_upper_triang_masked_softmax.h create mode 100644 megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu create mode 100644 megatron/global_vars.py create mode 100644 megatron/indexer.py create mode 100644 megatron/initialize.py create mode 100644 megatron/learning_rates.py create mode 100644 megatron/memory.py rename {optim => megatron/model}/__init__.py (58%) create mode 100644 megatron/model/bert_model.py create mode 100644 megatron/model/classification.py rename {model => megatron/model}/distributed.py (82%) create mode 100644 megatron/model/fused_bias_gelu.py create mode 100644 megatron/model/fused_softmax.py create mode 100644 megatron/model/gpt2_model.py create mode 100644 megatron/model/language_model.py create mode 100644 megatron/model/multiple_choice.py create mode 100644 megatron/model/realm_model.py create mode 100644 megatron/model/transformer.py create mode 100644 megatron/model/utils.py create mode 100644 megatron/module.py create mode 100644 megatron/mpu/__init__.py create mode 100644 megatron/mpu/cross_entropy.py create mode 100644 megatron/mpu/data.py create mode 100644 megatron/mpu/grads.py create mode 100644 megatron/mpu/initialize.py create mode 100644 megatron/mpu/layers.py create mode 100644 megatron/mpu/mappings.py create mode 100644 megatron/mpu/random.py create mode 100644 megatron/mpu/tests/__init__.py create mode 100644 megatron/mpu/tests/commons.py create mode 100644 megatron/mpu/tests/test_cross_entropy.py create mode 100644 megatron/mpu/tests/test_data.py create mode 100644 megatron/mpu/tests/test_initialize.py create mode 100644 megatron/mpu/tests/test_layers.py create mode 100644 megatron/mpu/tests/test_random.py create mode 100644 megatron/mpu/utils.py create mode 100644 megatron/package_info.py create mode 100644 megatron/text_generation_utils.py rename {model => megatron/tokenizer}/__init__.py (83%) mode change 100755 => 100644 create mode 100644 
megatron/tokenizer/bert_tokenization.py create mode 100644 megatron/tokenizer/gpt2_tokenization.py create mode 100644 megatron/tokenizer/tokenizer.py create mode 100644 megatron/training.py create mode 100644 megatron/utils.py delete mode 100755 model/model.py delete mode 100644 model/modeling.py delete mode 100755 optim/adam.py mode change 100755 => 100644 pretrain_bert.py create mode 100644 pretrain_gpt2.py create mode 100644 pretrain_ict.py delete mode 100755 scripts/pretrain_bert.sh delete mode 100755 scripts/pretrain_bert_distributed.sh delete mode 100755 scripts/pretrain_bert_sentencepiece.sh delete mode 100755 scripts/pretrain_bert_tfrecords_distributed.sh create mode 100644 setup.py create mode 100644 tasks/data_utils.py create mode 100644 tasks/ensemble_classifier.py create mode 100644 tasks/eval_utils.py create mode 100644 tasks/finetune_utils.py create mode 100644 tasks/glue/data.py create mode 100644 tasks/glue/finetune.py create mode 100644 tasks/glue/mnli.py create mode 100644 tasks/glue/qqp.py create mode 100644 tasks/main.py create mode 100644 tasks/race/data.py create mode 100644 tasks/race/finetune.py create mode 100644 tasks/zeroshot_gpt2/datasets.py create mode 100644 tasks/zeroshot_gpt2/detokenizer.py create mode 100644 tasks/zeroshot_gpt2/evaluate.py create mode 100644 tools/create_doc_index.py create mode 100644 tools/generate_samples_gpt2.py create mode 100644 tools/linter.py create mode 100644 tools/merge_mp_partitions.py create mode 100644 tools/openwebtext/README.md create mode 100644 tools/openwebtext/blacklist_urls.py create mode 100644 tools/openwebtext/cleanup_dataset.py create mode 100644 tools/openwebtext/find_duplicates.py create mode 100644 tools/openwebtext/group_duplicates_url.py create mode 100644 tools/openwebtext/merge_jsons.py create mode 100644 tools/openwebtext/remove_group_duplicates.py create mode 100644 tools/preprocess_data.py delete mode 100644 utils.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 
0000000000000000000000000000000000000000..961b6991c6dfec3b7367b003d23bfa607713e8a7 GIT binary patch literal 6148 zcmeHK%Wl&^6ur}?#c@TO1rpsLdBZNMQbOsfQb<#zENBs~=mJn{H>S1fcp}>&l#nWW zz5o{d0^h+G@GY$1%mWn1W{V;;SDHEFnRDiG@7UuZBGw-Ewux#)WFZNx*vLvuw2M@* zC0(-)WU`MVI-x^4py%Wk*rvjGz<6NMJRsg(oA&95I(Tnq-rsJ-&;5w|Xcr%c$9TnW zeoB)|u)*haPFECALT@Ofh=RO^{eWhDMq`2djy&qqK(Hyz^TimnHN#&;uYK?@O1NL= zxHe`oBwx-ZMxGTl?Bm>gN;Uo{9q4s`^9NCwW{t*owzOPXS-oXhwr$_GU%QulrJ9clLRld0`wRQXKXIP+pycagUGMd>Hpq*^jFl zEUQ|rp4K+U#PF0QCcmH=yhMcqBsoWoBD!l;}fj#jH%t?#-z!O6Ax?^inpO z*yyvHb_M-<&|DR^E9O;Ta;!g^XNo!Gn3&QcnBFed+9FKG%)xlTcwnJCAifU{lE9Y6 zsY3bbKqjvMzzV9Rp)7tEgvQa>(l}L!78p}fppr6u#b8Q~dPnnG8m9`CoS43RFg-HU zHxwpENB@qb6Kg3nx$%JUK;8q(>aixy|AW8Z|MMdAWISLz@Lzd=RXT2`jV0-`wXisG w);dUMNWuhfs!&Rh>El=y#8JG7Bn@qjC;(d;rwUO6Gd}{-4JI=l_@^HD1r8tf_5c6? literal 0 HcmV?d00001 diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..f4555efb --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @EleutherAI/pm-gptneo diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..fc3d04be --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +FROM atlanticcrypto/cuda-ssh-server:10.2-cudnn + +RUN apt-get update && \ + apt-get install -y git python3.8 python3.8-dev python3-pip sudo pdsh htop llvm-9-dev cmake tmux && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ + python3 -m pip install --upgrade pip && \ + pip3 install pipx gpustat && \ + python3 -m pipx ensurepath + +RUN mkdir -p ~/.ssh /app /job /build_dir && \ + echo 'Host *' > ~/.ssh/config && \ + echo ' StrictHostKeyChecking no' >> ~/.ssh/config && \ + echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \ + echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \ + echo 'export PDSH_RCMD_TYPE=ssh' >> ~/.bashrc + +WORKDIR /build_dir + +COPY requirements.txt /build_dir +RUN pip install torch==1.7.1 +RUN pip 
install -r requirements.txt + +WORKDIR /app diff --git a/LICENSE b/LICENSE index e24d383f..c4bad159 100644 --- a/LICENSE +++ b/LICENSE @@ -1,27 +1,40 @@ -<<<<<<< HEAD -MIT License - -Copyright (c) 2021 EleutherAI - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -======= -------------- LICENSE FOR huggingface(transformer) repository -------------- +The following applies to all files unless otherwise noted: + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-- + +This repository also contains code from Hugging Face Inc., Google Research, +and Facebook (from their Fairseq project). Files from these +organizations have notices at the top of each file. Below are licenses +used in those files, as indicated. + + +------------- LICENSE FOR huggingface and Google Research code -------------- Apache License @@ -225,4 +238,27 @@ SOFTWARE. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ->>>>>>> initial commit + +------------- LICENSE FOR Facebook Fairseq code -------------- + +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..f4479118 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include megatron/data/Makefile +include megatron/data/helpers.cpp diff --git a/README.md b/README.md index f2be5d75..a8a58b18 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,568 @@ -# Megatron3D +[Megatron](https://arxiv.org/pdf/1909.08053.pdf) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel, and multinode training of [GPT-2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) and [BERT](https://arxiv.org/pdf/1810.04805.pdf) using mixed precision. -Megatron is a large, powerful transformer. 
This repo is for ongoing research on training large, powerful transformer language models at scale. Currently, we support multinode training of [BERT](https://arxiv.org/pdf/1810.04805.pdf) in mixed precision. Our codebase is capable of training BERT Large on 64 V100 GPUs in 3 days. We achieved a final language modeling perplexity of 3.15 and SQuAD F1-score of 90.7. +Using our GPT-2 model we achieve a perplexity of 10.8 on the WikiText-103 dataset (improving SOTA from 15.8) and an accuracy of 66.5% on the LAMBADA datasets. For BERT training, we swapped the position of the layer normalization and the residual connection in the model architecture (similar to GPT-2 architucture), which allowed the models to continue to improve as they were scaled up. Our BERT models with 3.9 billion parameters reaches a loss of 1.16, SQuAD 2.0 F1-score of 91.7, and RACE accuracy of 90.9%. +Our codebase is capable of efficiently training very large (several billion parameter) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs we consider the following GPT-2 model sizes. All models use a vocabulary size of 51,200 and a sequence length of 1024. + +![Cases](images/cases.png) + +The table below details the weak scaling from 1 to 8 GPUs of our model parallelism code in both a DGX-2 and a DGX-A100. Notice that we double the batch size on the DGX-A100 but the iteration time decreases compared to the DGX-2 resulting in a **2.1x** speedup for the end-to-end application. + +![Model Parallel Scaling](images/scaling-mp.png) + +The following table details how Megatron scales using data parallelism in conjuction with model parallelism in a cluster of DGX-A100s. All of these cases use 128-way data parallelism and the scaling numbers are relative to a single A100 (Case 1B with a 1076ms iteration time). 
+ +![Data Parallel Scaling](images/scaling-dp.png) + + +# Contents + + +- [Setup](#setup) + - [Downloading Checkpoints](#downloading-checkpoints) +- [Usage](#usage) +- [Training](#training) + - [Data Preprocessing](#data-preprocessing) + - [BERT Pretraining](#bert-pretraining) + - [GPT-2 Pretraining](#gpt-2-pretraining) + - [Distributed BERT or GPT-2 Pretraining](#distributed-bert-or-gpt-2-pretraining) +- [REALM Pipeline](#realm) +- [Evaluation and Tasks](#evaluation-and-tasks) + - [GPT-2 Text Generation](#gpt-2-text-generation) + - [GPT-2 Evaluation](#gpt-2-evaluation) + - [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation) + - [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy) + - [BERT Task Evaluation](#bert-task-evaluation) + - [RACE Evaluation](#race-evaluation) + - [MNLI Evaluation](#mnli-evaluation) +- [Datasets](#datasets) + - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data) + - [Collecting GPT-2 Webtext Data](#collecting-gpt-2-webtext-data) + + + + # Setup -We officially support only python3.6. +We officially support only python 3.6, pytorch 1.5, cuda 10, and nccl 2.6 versions and above. + +To use this repo please install the latest supported versions of PyTorch with GPU support and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start). We strongly recommend using one of [NGC's recent PyTorch containers](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) (the latest compatible version at time of publication can be pulled with `docker pull nvcr.io/nvidia/pytorch:20.03-py3`). Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation or downstream tasks. + +To use megatron you can either clone the repo or install it via pip (make sure python3-dev is installed): +
+pip install megatron-lm
+
-To use this repo please install the latest supported versions of PyTorch with GPU support. + +## Downloading Checkpoints +We've provided two pretrained checkpoints for use to evaluate or finetuning downstream tasks. To access these checkpoints, first please [sign up](https://ngc.nvidia.com/signup) for and [setup](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI. -Additionally, part of this codebase leverages tensorflow-cpu to perform dataloading of TFRecords. We recommend creating a virtual environment (to avoid breaking existing tf installations) and install our `requirements.txt`. +The checkpoints can be downloaded with: +
+ngc registry model download-version --dest <output_base_directory> nvidia/<model_name>:<version>
+
-``` -python -m pip install virtualenv -virtualenv bert_env -source bert_env/bin/activate -pip install -r requirements.txt -``` +The available models along with `:` are below: +* [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m): megatron\_bert\_345m:v0.0 +* [GPT-2-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m): megatron\_lm\_345m:v0.0 +The models require vocabulary files to run. The BERT uncased WordPiece vocab file can be extracted from Google's [pretrained BERT models](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt). The GPT-2 [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) can be downloaded directly. +Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1) + + # Usage -We've provided 4 scripts that pretrain BERT. All saved checkpoints can be used for finetuning according to [existing implementations](https://github.com/huggingface). Save model checkpoints with `--save`. +After installation, there are several possible workflows. The most comprehensive is: +1. Data preprocessing +2. Pretraining +3. Finetuning (Optional for zero-shot tasks) +4. Downstream task evaluation or text generation + +However, steps 1 and 2 can be replaced by using one of the pretrained models mentioned above. + +We've provided several scripts for pretraining both BERT and GPT-2 in [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT-2 interactive text generation. + + +# Training + +## Data Preprocessing +We support three file formats for training, but all require preprocessing. 
First, place your training data in a loose json format, with one json containing a text sample per line. For example: +
+{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
+{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
+
+ +The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py) The other metadata are optional and are not used in training. + +The loose json is then processed into a binary format for training. To convert the json into mmap, cached index file, or the lazy loader format use `preprocess_data.py`. Set the `--dataset-impl` flag to `mmap`, `cached`, or `lazy`, respectively (default is `mmap`). An example script to prepare data for BERT training is: +
+python tools/preprocess_data.py \
+       --input my-corpus.json \
+       --output-prefix my-bert \
+       --vocab bert-vocab.txt \
+       --dataset-impl mmap \
+       --tokenizer-type BertWordPieceLowerCase \
+       --split-sentences
+
+ +The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension. + +Some minor modifications are required for GPT-2 data preprocessing, namely, the addition of a merge table, an end-of-document token, removal of sentence splitting, and a change to the tokenizer type: +
+python tools/preprocess_data.py \
+       --input my-corpus.json \
+       --output-prefix my-gpt2 \
+       --vocab gpt2-vocab.json \
+       --dataset-impl mmap \
+       --tokenizer-type GPT2BPETokenizer \
+       --merge-file gpt2-merges.txt \
+       --append-eod
+
+ +Here the output files are named `my-gpt2_text_document.bin` and `my-gpt2_text_document.idx`. As before, in GPT-2 training, use the longer name without the extension as `--data-path`. + +Further command line arguments are described in the source file [`preprocess_data.py`](./tools/preprocess_data.py). + + ## BERT Pretraining -`bash scripts/pretrain_bert.sh` +`bash examples/pretrain_bert.sh` + +This script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--warmup`. While this is single GPU training, the batch size specified by `--batch-size` is per GPU used for data parallelism. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). + +The logging, checkpoint-saving, and evaluation intervals are specified. Checkpointing the activations facilitates the training of larger models and/or batches. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions. -This script runs single gpu BERT pretraining and is mainly for debugging purposes. +
+CHECKPOINT_PATH=checkpoints/bert_345m
+VOCAB_FILE=bert-vocab.txt
+DATA_PATH=my-bert_text_sentence
 
-To use this script place your `--train-data` in loose json format with one json per line. The text field of your json dictionaries should correspond to `--text-key`. 
+BERT_ARGS="--num-layers 24 \
+           --hidden-size 1024 \
+           --num-attention-heads 16 \
+           --seq-length 512 \
+           --max-position-embeddings 512 \
+           --lr 0.0001 \
+           --train-iters 2000000 \
+           --min-lr 0.00001 \
+           --lr-decay-iters 990000 \
+           --warmup 0.01 \
+           --batch-size 8 \
+           --vocab-file $VOCAB_FILE \
+           --split 949,50,1 \
+           --fp16"
+
+OUTPUT_ARGS="--log-interval 10 \
+             --save-interval 500 \
+             --eval-interval 100 \
+             --eval-iters 10 \
+             --checkpoint-activations"
 
-```
 python pretrain_bert.py \
-    --batch-size 4 \
-    --tokenizer-type BertWordPieceTokenizer \
-    --cache-dir temp_cache_dir \
-    --tokenizer-model-type bert-large-uncased \
-    --vocab-size 30522 \
-    --train-data wikipedia \
-    --loose-json \
-    --text-key text \
-    --split 1000,1,1 \
-    --lazy-loader \
-    --max-preds-per-seq 80 \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --intermediate-size 4096 \
-    --num-attention-heads 16 \
-    --hidden-dropout 0.1 \
-    --attention-dropout 0.1 \
-    --train-iters 1000000 \
+       $BERT_ARGS \
+       $OUTPUT_ARGS \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH
+
+ +Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). + + +## GPT-2 Pretraining +`bash examples/pretrain_gpt2.sh` + +This script runs single GPU 345M parameter GPT-2 pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training. + +It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay. Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions. + +
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+DATA_PATH=my-gpt2_text_document
+
+GPT2_ARGS="--num-layers 24 \
+           --hidden-size 1024 \
+           --num-attention-heads 16 \
+           --seq-length 1024 \
+           --max-position-embeddings 1024 \
+           --batch-size 4 \
+           --lr 0.00015 \
+           --train-iters 500000 \
+           --lr-decay-iters 320000 \
+           --lr-decay-style cosine \
+           --vocab-file $VOCAB_FILE \
+           --merge-file $MERGE_FILE \
+           --warmup .01 \
+           --fp16"
+
+OUTPUT_ARGS=<same as those in BERT pretraining above>
+
+python pretrain_gpt2.py \
+       $GPT2_ARGS \
+       $OUTPUT_ARGS \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+
+ +Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). + + +## Distributed BERT or GPT-2 Pretraining +`bash examples/pretrain_bert_distributed.sh` + +`bash examples/pretrain_gpt2_distributed.sh` + +These scripts use the PyTorch distributed launcher for distributed training. As such, multinode training can be achieved by properly setting environment variables and using `init_method='env://'` in the launcher. See the official PyTorch [documentation](https://pytorch.org/docs/stable/distributed.html#launch-utility) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multinode training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the Python flag `-m torch.distributed.launch`, detailed below, are the only additional requirements to adopt distributed training. + +The two tiers of parallelism are data and model parallelism. First, we facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model parallel sizes. For example, for the 8.3 billion parameters model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. 
We empirically found that using a smaller model in those cases improves the training time. + +Second, we developed a simple and efficient intra-layer model parallel approach. To use model parallelism, add the `--model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. With `WORLD_SIZE` GPUs and `MP_SIZE` model parallel size, `WORLD_SIZE`/`MP_SIZE` GPUs will be used for data parallelism. The default value for `--model-parallel-size` is 1, which will not implement model parallelism. + +Other than these minor changes, the distributed training is identical to the training on a single GPU. + +Distributed BERT training: +
+WORLD_SIZE=8
+MP_SIZE=2
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+CHECKPOINT_PATH=checkpoints/bert_345m
+VOCAB_FILE=bert-vocab.txt
+DATA_PATH=my-bert_text_sentence
+BERT_ARGS=<same as those in BERT pretraining above>
+OUTPUT_ARGS=<same as those in BERT pretraining above>
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./pretrain_bert.py \
+                $BERT_ARGS \
+                $OUTPUT_ARGS \
+                --save $CHECKPOINT_PATH \
+                --load $CHECKPOINT_PATH \
+                --data-path $DATA_PATH \
+                --model-parallel-size $MP_SIZE \
+                --DDP-impl torch
+
+ +Distributed GPT-2 training: +
+WORLD_SIZE=8
+MP_SIZE=2
+
+DISTRIBUTED_ARGS=<same as those directly above>
+
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+DATA_PATH=my-gpt2_text_document
+GPT2_ARGS=<same as those in GPT-2 pretraining above>
+OUTPUT_ARGS=<same as those in BERT pretraining above>
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./pretrain_gpt2.py \
+                $GPT2_ARGS \
+                $OUTPUT_ARGS \
+                --save $CHECKPOINT_PATH \
+                --load $CHECKPOINT_PATH \
+                --data-path $DATA_PATH \
+                --model-parallel-size $MP_SIZE \
+                --DDP-impl torch
+
+
+ + +## REALM Pipeline +We are working on implementing the [REALM](https://arxiv.org/pdf/2002.08909.pdf) system. The following sections (will) reflect the three stages of training it. For now it's just the ICT code. +Loosely, they are pretraining the retriever modules, then jointly training the language model and the retriever, and then finetuning a question answering head on the language model with fixed retriever. + +### Inverse Cloze Task (ICT) Pretraining +1. Have a corpus in loose JSON format with the intention of creating a collection of fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block but also multiple blocks per document. +Run `tools/preprocess_data.py` to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. For the original REALM system, we construct two datasets, one with the title of every document, and another with the body. +Refer to the following script +
+python preprocess_data.py \
+    --input /path/to/corpus.json \
+    --json-keys text title \
+    --split-sentences \
+    --tokenizer-type BertWordPieceLowerCase \
+    --vocab-file /path/to/vocab.txt \
+    --output-prefix corpus_indexed \
+    --workers 5  # works well for 10 CPU cores. Scale up accordingly.
+
 + +2. Use a custom samples mapping function in place of `megatron/data/realm_dataset_utils.get_block_samples_mapping` if required. To do this, you will need to implement a new function in C++ inside of `megatron/data/helpers.cpp`. The samples mapping data structure is used to select the data that will constitute every training sample in advance of the training loop. + The samples mapping is responsible for holding all of the required metadata needed to construct the sample from one or more indexed datasets. In REALM, the samples mapping contains the start and end sentence indices, as well as the document index (to find the correct title for a body) and a unique ID for every block. +3. Pretrain a BERT language model using `pretrain_bert.py`, with the sequence length equal to the block size in token ids. This model should be trained on the same indexed dataset that is used to supply the blocks for the information retrieval task. +In REALM, this is an uncased bert base model trained with the standard hyperparameters. +4. Use `pretrain_ict.py` to train an `ICTBertModel` which uses two BERT-based encoders to encode queries and blocks to perform retrieval with. +The script below trains the ICT model from REALM. It references a pretrained BERT model (step 3) in the `--bert-load` argument. The batch size used in the paper is 4096, so this would need to be run with data parallel world size 32. +
+python pretrain_ict.py \
+    --num-layers 12 \
+    --num-attention-heads 12 \
+    --hidden-size 768 \
+    --batch-size 128 \
+    --seq-length 256 \
+    --max-position-embeddings 256 \
+    --ict-head-size 128 \
+    --train-iters 100000 \
+    --checkpoint-activations \
+    --bert-load /path/to/pretrained_bert \
+    --load checkpoints \
+    --save checkpoints \
+    --data-path /path/to/indexed_dataset \
+    --titles-data-path /path/to/titles_indexed_dataset \
+    --vocab-file /path/to/vocab.txt \
     --lr 0.0001 \
+    --num-workers 2 \
     --lr-decay-style linear \
-    --lr-decay-iters 990000 \
-    --warmup .01 \
     --weight-decay 1e-2 \
     --clip-grad 1.0 \
-    --fp16 \
-    --fp32-layernorm \
-    --fp32-embedding \
-    --hysteresis 2 \
-    --num-workers 2 
-```
+    --warmup .01 \
+    --save-interval 3000 \
+    --query-in-block-prob 0.1 \
+    --fp16
+
+
+ +### Building an Index of Block Embeddings +After having trained an ICT model, you can now embed an entire dataset of blocks by creating a `BlockData` structure. After that has been saved, you can load it +and wrap it with a `FaissMIPSIndex` to do fast similarity search which is key in the learned information retrieval pipeline. The initial index can be built with the following script, meant to be run in an interactive session. It can leverage multiple GPUs on multiple nodes to index large datasets much more quickly. + +
+python tools/create_doc_index.py \
+    --num-layers 12 \
+    --hidden-size 768 \
+    --ict-head-size 128 \
+    --num-attention-heads 12 \
+    --batch-size 128 \
+    --checkpoint-activations \
+    --seq-length 256 \
+    --max-position-embeddings 256 \
+    --ict-load /path/to/pretrained_ict \
+    --data-path /path/to/indexed_dataset \
+    --titles-data-path /path/to/titles_indexed_dataset \
+    --block-data-path embedded_blocks.pkl \
+    --indexer-log-interval 1000 \
+    --indexer-batch-size 128 \
+    --vocab-file /path/to/vocab.txt \
+    --num-workers 2 \
+    --fp16
+
+ + +# Evaluation and Tasks + +We provide several command line arguments, detailed in the scripts listed below, to handle various zero-shot and fine-tuned downstream tasks. However, you can also finetune your model from a pretrained checkpoint on other corpora as desired. To do so, simply add the `--finetune` flag and adjust the input files and training parameters within the original training script. The iteration count will be reset to zero, and the optimizer and internal state will be reinitialized. If the fine-tuning is interrupted for any reason, be sure to remove the `--finetune` flag before continuing, otherwise the training will start again from the beginning. + +Because evaluation requires substantially less memory than training, it may be advantageous to merge a model trained in parallel for use on a single GPU in downstream tasks. The following script accomplishes this. + +
+MODEL_PARALLEL_SIZE=2
+
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m
+
+WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
+        --model-type BERT \
+        --model-parallel-size $MODEL_PARALLEL_SIZE \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --num-layers 24 \
+        --hidden-size 1024 \
+        --num-attention-heads 16 \
+        --seq-length 512 \
+        --max-position-embeddings 512 \
+        --load $CHECKPOINT_PATH
+
+
 + +Several downstream tasks are described for both GPT-2 and BERT models below. They can be run in distributed and model parallel modes with the same changes used in the training scripts. + + +## GPT-2 Text Generation +`bash examples/generate_text.sh` + +We generate text samples using largely the GPT-2 pretraining script. A few changes need to be made, such as providing the path to the pretrained checkpoint, specifying the length of the output samples, and choosing whether to generate texts unconditionally (`--num-samples` to denote how many samples to generate) or conditionally (pass `--sample-input-file`, where each line of the file will be used as the conditional text). There are a few optional parameters to play with, e.g. `top-k`, `top-p`, or `greedy` (set top-k and top-p to 0) sampling. +
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+GPT2_ARGS=<same as those in GPT-2 pretraining above>
+
+MAX_OUTPUT_SEQUENCE_LENGTH=1024
+TEMPERATURE=1.0
+TOP_P=0.9
+NUMBER_OF_SAMPLES=2
+OUTPUT_FILE=samples.json
+
+python tools/generate_samples_gpt2.py \
+       $GPT2_ARGS \
+       --load $CHECKPOINT_PATH \
+       --out-seq-length $MAX_OUTPUT_SEQUENCE_LENGTH \
+       --temperature $TEMPERATURE \
+       --genfile $OUTPUT_FILE \
+       --num-samples $NUMBER_OF_SAMPLES \
+       --top_p $TOP_P \
+       --recompute
+
+ + +## GPT-2 Evaluation +We include example scripts for GPT-2 evaluation on WikiText perplexity evaluation and LAMBADA Cloze accuracy. + + +### WikiText Perplexity Evaluation +For even comparison with prior works, we evaluate perplexity on the word-level [WikiText-103 test dataset](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip), and appropriately compute perplexity given the change in tokens when using our subword tokenizer. + +We use the following command to run WikiText-103 evaluation on a 345M parameter model. +
+TASK="WIKITEXT103"
+
+VALID_DATA=<wikitext path>.txt
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+
+COMMON_TASK_ARGS="--num-layers 24 \
+                  --hidden-size 1024 \
+                  --num-attention-heads 16 \
+                  --seq-length 1024 \
+                  --max-position-embeddings 1024 \
+                  --fp16 \
+                  --vocab-file $VOCAB_FILE"
+
+python tasks/main.py \
+       --task $TASK \
+       $COMMON_TASK_ARGS \
+       --valid-data $VALID_DATA \
+       --tokenizer-type GPT2BPETokenizer \
+       --merge-file $MERGE_FILE \
+       --load $CHECKPOINT_PATH \
+       --batch-size 8 \
+       --checkpoint-activations \
+       --log-interval 10 \
+       --no-load-optim \
+       --no-load-rng
+
 + + + +### LAMBADA Cloze Accuracy +To compute LAMBADA cloze accuracy (the accuracy of predicting the last token given the preceding tokens) we utilize a detokenized, processed version of the [LAMBADA dataset](https://github.com/cybertronai/bflm/blob/master/lambada_test.jsonl). + +We use the following command to run LAMBADA evaluation on a 345M parameter model. Note that the `--strict-lambada` flag should be used to require whole word matching. Make sure that `lambada` is part of the file path. + +
+TASK="LAMBADA"
+
+VALID_DATA=<lambada path>.json
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+COMMON_TASK_ARGS=<same as those in WikiText Perplexity Evaluation above>
+
+python tasks/main.py \
+       --task $TASK \
+       $COMMON_TASK_ARGS \
+       --valid-data $VALID_DATA \
+       --tokenizer-type GPT2BPETokenizer \
+       --strict-lambada \
+       --merge-file $MERGE_FILE \
+       --load $CHECKPOINT_PATH \
+       --batch-size 8 \
+       --checkpoint-activations \
+       --log-interval 10 \
+       --no-load-optim \
+       --no-load-rng
+
 + +Further command line arguments are described in the source file [`main.py`](./tasks/main.py). + + +## BERT Task Evaluation + +### RACE Evaluation +The following script finetunes the BERT model for evaluation on the [RACE dataset](http://www.cs.cmu.edu/~glai1/data/race/). The `TRAIN_DATA` and `VALID_DATA` directories contain the RACE dataset as separate `.txt` files. -## Distributed BERT Pretraining -`bash scripts/pretrain_bert_distributed.sh` +
+TRAIN_DATA="data/RACE/train/middle"
+VALID_DATA="data/RACE/dev/middle \
+            data/RACE/dev/high"
+VOCAB_FILE=bert-vocab.txt
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+CHECKPOINT_PATH=checkpoints/bert_345m_race
+COMMON_TASK_ARGS="--num-layers 24 \
+                  --hidden-size 1024 \
+                  --num-attention-heads 16 \
+                  --seq-length 512 \
+                  --max-position-embeddings 512 \
+                  --fp16 \
+                  --vocab-file $VOCAB_FILE"
 
-To use this script, follow the same data preparation procedure as in [earlier sections](#bert-pretraining). This script uses the pytorch distributed launcher to launch distributed training. As such, multinode training can be achieved by properly setting environment variables for the `env://` init method. See the official pytorch [documentation](https://pytorch.org/docs/stable/distributed.html#launch-utility) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default multinode training uses the nccl distributed backend.
+COMMON_TASK_ARGS_EXT="--train-data $TRAIN_DATA \
+                      --valid-data $VALID_DATA \
+                      --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+                      --checkpoint-activations \
+                      --save-interval 10000 \
+                      --save $CHECKPOINT_PATH \
+                      --log-interval 100 \
+                      --eval-interval 1000 \
+                      --eval-iters 10 \
+                      --weight-decay 1.0e-1"
 
-## Distributed BERT Pretraining with TFRecords
-`bash scripts/pretrain_bert_tfrecords_distributed.sh`
+python tasks/main.py \
+       --task RACE \
+       $COMMON_TASK_ARGS \
+       $COMMON_TASK_ARGS_EXT \
+       --tokenizer-type BertWordPieceLowerCase \
+       --epochs 3 \
+       --batch-size 4 \
+       --lr 1.0e-5 \
+       --warmup 0.06
+
-This script takes advantage of TensorFlow BERT's [`create_pretraining.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) script to pre-cache the dataset in the TFRecord format. To convert the data to pytorch tensors we use a `TFRecordDataset` and tensorflow eager mode to turn the TFRecords into numpy matrices before loading them into pytorch gpu tensors. This greatly reduces the overhead of dataprocessing and speeds up training. Pass a whitespace-separated list of TFRecord paths to `--train-data` and enable the `--use-tfrecords` flag. Multinode training can be achieved as described in the [previous section](#distributed-bert-pretraining). + +### MNLI Evaluation +The following script finetunes the BERT model for evaluation with the [MultiNLI sentence pair corpus](https://www.nyu.edu/projects/bowman/multinli/). Because the matching tasks are quite similar, the script can be quickly tweaked to work with the [Quora Question Pairs](https://www.kaggle.com/quora/question-pairs-dataset) (QQP) dataset as well. -## Train Custom Sentence Piece Tokenizer and Pretrain BERT -`bash scripts/pretrain_bert_sentencepiece.sh` +
 
-This script runs BERT pretraining with a `sentencepiece` tokenizer. If no sentencepiece tokenizer exists at `--tokenizer-path` one will be trained automatically. The sentencepiece tokenizer can be used with the previous scripts (NOTE: sentencepiece training can only happen during single gpu pretraining). `<--tokenizer-path>.vocab` can be used with [`create_pretraining_data.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) to make a TFRecord dataset with the given tokenization.
+TRAIN_DATA="data/glue_data/MNLI/train.tsv"
+VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
+            data/glue_data/MNLI/dev_mismatched.tsv"
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m_mnli
+COMMON_TASK_ARGS=<same as those in RACE Evaluation above>
+COMMON_TASK_ARGS_EXT=<same as those in RACE Evaluation above>
 
+python tasks/main.py \
+       --task MNLI \
+       $COMMON_TASK_ARGS \
+       $COMMON_TASK_ARGS_EXT \
+       --tokenizer-type BertWordPieceLowerCase \
+       --epochs 5 \
+       --batch-size 8 \
+       --lr 5.0e-5 \
+       --warmup 0.065
+
-# Collecting Wikipedia Training Data -We recommend following the wikipedia data extraction process specified by google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text." + +# Datasets +We do not host any datasets for GPT-2 or BERT training, however, we detail their collection so that our results may be reproduced. -We recommend using the `--json` argument when using WikiExtractor, which will dump the wikipedia data into loose json format (one json per line), making it more manageable and readily consumable by our codebase. + +## Collecting Wikipedia Training Data +We recommend following the Wikipedia data extraction process specified by Google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text." -Once the json dataset is ready make sure to set the path in line 27 of `data_utils/corpora.py`. +We recommend using the `--json` argument when using WikiExtractor, which will dump the Wikipedia data into loose json format (one json per line), making it more manageable on the file system and also readily consumable by our codebase. We recommend further preprocessing this json dataset by nltk punctuation standardization. For BERT training, add newlines between sentences during data preprocessing. This is done with the `--split-sentences` flag in `preprocess_data.py` as described [above](#data-preprocessing). 
(Note that if you'd like to use Wikipedia data for GPT-2 training you should still clean it with nltk/spacy/ftfy, but do not split it into newline separated sentences.) -If your system is memory limited we also recommend running pretraining with the `--lazy-loader` argument as we've done. After preprocessing the dataset once, this will allow the dataset to be lazily loaded from disk, as opposed to storing it in memory. ->>>>>>> initial commit + +## Collecting GPT-2 Webtext Data +We utilize the publicly available [OpenWebText](https://github.com/eukaryote31/openwebtext) library from [jcpeterson](https://github.com/jcpeterson/openwebtext) and [eukaryote31's](https://github.com/eukaryote31/openwebtext) work to download urls. We then filtered, cleaned, and deduplicated all downloaded content according to the procedure described in our [openwebtext](./tools/openwebtext) directory. For reddit URLs corresponding to content up to October 2018 we arrived at approximately 37GB of content. diff --git a/arguments.py b/arguments.py deleted file mode 100644 index d7d554e6..00000000 --- a/arguments.py +++ /dev/null @@ -1,283 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""argparser configuration""" - -import argparse -import os -import torch - - -def add_model_config_args(parser): - """Model arguments""" - - group = parser.add_argument_group('model', 'model configuration') - - group.add_argument('--pretrained-bert', action='store_true', - help='use a pretrained bert-large-uncased model instead' - 'of initializing from scratch. See ' - '--tokenizer-model-type to specify which pretrained ' - 'BERT model to use') - group.add_argument('--attention-dropout', type=float, default=0.1, - help='dropout probability for attention weights') - group.add_argument('--num-attention-heads', type=int, default=16, - help='num of transformer attention heads') - group.add_argument('--hidden-size', type=int, default=1024, - help='tansformer hidden size') - group.add_argument('--intermediate-size', type=int, default=None, - help='transformer embedding dimension for FFN' - 'set to 4*`--hidden-size` if it is None') - group.add_argument('--num-layers', type=int, default=24, - help='num decoder layers') - group.add_argument('--layernorm-epsilon', type=float, default=1e-12, - help='layer norm epsilon') - group.add_argument('--hidden-dropout', type=float, default=0.0, - help='dropout probability for hidden state transformer') - group.add_argument('--max-position-embeddings', type=int, default=512, - help='maximum number of position embeddings to use') - group.add_argument('--vocab-size', type=int, default=30522, - help='vocab size to use for non-character-level ' - 'tokenization. 
This value will only be used when ' - 'creating a tokenizer') - - return parser - - -def add_fp16_config_args(parser): - """Mixed precision arguments.""" - - group = parser.add_argument_group('fp16', 'fp16 configurations') - - group.add_argument('--fp16', action='store_true', - help='Run model in fp16 mode') - group.add_argument('--fp32-embedding', action='store_true', - help='embedding in fp32') - group.add_argument('--fp32-layernorm', action='store_true', - help='layer norm in fp32') - group.add_argument('--fp32-tokentypes', action='store_true', - help='embedding token types in fp32') - group.add_argument('--fp32-allreduce', action='store_true', - help='all-reduce in fp32') - group.add_argument('--hysteresis', type=int, default=2, - help='hysteresis for dynamic loss scaling') - group.add_argument('--loss-scale', type=float, default=None, - help='Static loss scaling, positive power of 2 ' - 'values can improve fp16 convergence. If None, dynamic' - 'loss scaling is used.') - group.add_argument('--loss-scale-window', type=float, default=1000, - help='Window over which to raise/lower dynamic scale') - group.add_argument('--min-scale', type=float, default=1, - help='Minimum loss scale for dynamic loss scale') - - return parser - - -def add_training_args(parser): - """Training arguments.""" - - group = parser.add_argument_group('train', 'training configurations') - - group.add_argument('--batch-size', type=int, default=4, - help='Data Loader batch size') - group.add_argument('--weight-decay', type=float, default=0.01, - help='weight decay coefficient for L2 regularization') - group.add_argument('--checkpoint-activations', action='store_true', - help='checkpoint activation to allow for training ' - 'with larger models and sequences') - group.add_argument('--clip-grad', type=float, default=1.0, - help='gradient clipping') - group.add_argument('--epochs', type=int, default=1, - help='upper epoch limit') - group.add_argument('--log-interval', type=int, default=100, - 
help='report interval') - group.add_argument('--train-iters', type=int, default=1000000, - help='number of iterations per epoch') - group.add_argument('--seed', type=int, default=1234, - help='random seed') - # Learning rate. - group.add_argument('--lr-decay-iters', type=int, default=None, - help='number of iterations to decay LR over,' - ' If None defaults to `--train-iters`*`--epochs`') - group.add_argument('--lr-decay-style', type=str, default='linear', - choices=['constant', 'linear', 'cosine', 'exponential'], - help='learning rate decay function') - group.add_argument('--lr', type=float, default=1.0e-4, - help='initial learning rate') - group.add_argument('--warmup', type=float, default=0.01, - help='percentage of data to warmup on (.01 = 1% of all ' - 'training iters). Default 0.01') - # model checkpointing - group.add_argument('--save', type=str, default=None, - help='Output directory to save checkpoints to.') - group.add_argument('--save-iters', type=int, default=None, - help='Save every so often iterations.') - group.add_argument('--save-optim', action='store_true', - help='Save current optimizer.') - group.add_argument('--save-rng', action='store_true', - help='Save current rng state.') - group.add_argument('--save-all-rng', action='store_true', - help='Save current rng state of each rank in ' - 'distributed training.') - group.add_argument('--load', type=str, default=None, - help='Path to a particular model checkpoint. \ - (ex. 
`savedir/model.1000.pt`)') - group.add_argument('--load-optim', action='store_true', - help='Load most recent optimizer corresponding ' - 'to `--load`.') - group.add_argument('--load-rng', action='store_true', - help='Load most recent rng state corresponding ' - 'to `--load`.') - group.add_argument('--load-all-rng', action='store_true', - help='Load most recent rng state of each rank in ' - 'distributed training corresponding to `--load`(' - 'complementary to `--save-all-rng`).') - group.add_argument('--resume-dataloader', action='store_true', - help='Resume the dataloader when resuming training. ' - 'Does not apply to tfrecords dataloader, try resuming' - 'with a different seed in this case.') - # distributed training args - group.add_argument('--distributed-backend', default='nccl', - help='which backend to use for distributed ' - 'training. One of [gloo, nccl]') - group.add_argument('--local_rank', type=int, default=None, - help='local rank passed from distributed launcher') - - return parser - - -def add_evaluation_args(parser): - """Evaluation arguments.""" - - group = parser.add_argument_group('validation', 'validation configurations') - - group.add_argument('--eval-batch-size', type=int, default=None, - help='Data Loader batch size for evaluation datasets.' - 'Defaults to `--batch-size`') - group.add_argument('--eval-iters', type=int, default=2000, - help='number of iterations per epoch to run ' - 'validation/test for') - group.add_argument('--eval-seq-length', type=int, default=None, - help='Maximum sequence length to process for ' - 'evaluation. Defaults to `--seq-length`') - group.add_argument('--eval-max-preds-per-seq', type=int, default=None, - help='Maximum number of predictions to use for ' - 'evaluation. 
Defaults to ' - 'math.ceil(`--eval-seq-length`*.15/10)*10') - - return parser - - -def add_data_args(parser): - """Train/valid/test data arguments.""" - - group = parser.add_argument_group('data', 'data configurations') - - group.add_argument('--train-data', nargs='+', required=True, - help='Filename (or whitespace separated filenames) ' - 'for training.') - group.add_argument('--delim', default=',', - help='delimiter used to parse csv data files') - group.add_argument('--text-key', default='sentence', - help='key to use to extract text from json/csv') - group.add_argument('--eval-text-key', default=None, - help='key to use to extract text from ' - 'json/csv evaluation datasets') - group.add_argument('--valid-data', nargs='*', default=None, - help="""Filename for validation data.""") - group.add_argument('--split', default='1000,1,1', - help='comma-separated list of proportions for training,' - ' validation, and test split') - group.add_argument('--test-data', nargs='*', default=None, - help="""Filename for testing""") - - group.add_argument('--lazy-loader', action='store_true', - help='whether to lazy read the data set') - group.add_argument('--loose-json', action='store_true', - help='Use loose json (one json-formatted string per ' - 'newline), instead of tight json (data file is one ' - 'json string)') - group.add_argument('--num-workers', type=int, default=2, - help="""Number of workers to use for dataloading""") - group.add_argument('--tokenizer-model-type', type=str, - default='bert-large-uncased', - help="Model type to use for sentencepiece tokenization \ - (one of ['bpe', 'char', 'unigram', 'word']) or \ - bert vocab to use for BertWordPieceTokenizer (one of \ - ['bert-large-uncased', 'bert-large-cased', etc.])") - group.add_argument('--tokenizer-path', type=str, default='tokenizer.model', - help='path used to save/load sentencepiece tokenization ' - 'models') - group.add_argument('--tokenizer-type', type=str, - default='BertWordPieceTokenizer', - 
choices=['CharacterLevelTokenizer', - 'SentencePieceTokenizer', - 'BertWordPieceTokenizer'], - help='what type of tokenizer to use') - group.add_argument("--cache-dir", default=None, type=str, - help="Where to store pre-trained BERT downloads") - group.add_argument('--use-tfrecords', action='store_true', - help='load `--train-data`, `--valid-data`, ' - '`--test-data` from BERT tf records instead of ' - 'normal data pipeline') - group.add_argument('--seq-length', type=int, default=512, - help="Maximum sequence length to process") - group.add_argument('--max-preds-per-seq', type=int, default=None, - help='Maximum number of predictions to use per sequence.' - 'Defaults to math.ceil(`--seq-length`*.15/10)*10.' - 'MUST BE SPECIFIED IF `--use-tfrecords` is True.') - - return parser - - -def print_args(args): - """Print arguments.""" - - print('arguments:', flush=True) - for arg in vars(args): - dots = '.' * (29 - len(arg)) - print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) - - -def get_args(): - """Parse all the args.""" - - parser = argparse.ArgumentParser(description='PyTorch BERT Model') - parser = add_model_config_args(parser) - parser = add_fp16_config_args(parser) - parser = add_training_args(parser) - parser = add_evaluation_args(parser) - parser = add_data_args(parser) - - args = parser.parse_args() - - args.cuda = torch.cuda.is_available() - args.rank = int(os.getenv('RANK', '0')) - args.world_size = int(os.getenv("WORLD_SIZE", '1')) - - args.dynamic_loss_scale = False - if args.loss_scale is None: - args.dynamic_loss_scale = True - print(' > using dynamic loss scaling') - - # The args fp32_* or fp16_* meant to be active when the - # args fp16 is set. So the default behaviour should all - # be false. 
- if not args.fp16: - args.fp32_embedding = False - args.fp32_tokentypes = False - args.fp32_layernorm = False - - print_args(args) - return args diff --git a/data_utils/corpora.py b/data_utils/corpora.py deleted file mode 100755 index 334f3518..00000000 --- a/data_utils/corpora.py +++ /dev/null @@ -1,37 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""several datasets with preset arguments""" -from .datasets import json_dataset, csv_dataset - -class wikipedia(json_dataset): - """ - dataset for wikipedia with arguments configured for convenience - - command line usage: `--train-data wikipedia` - """ - PATH = '' - assert_str = "make sure to set PATH at line 27 of data_utils/corpora.py" - def __init__(self, **kwargs): - assert wikipedia.PATH != '', \ - wikipedia.assert_str - if not kwargs: - kwargs = {} - kwargs['text_key'] = 'text' - kwargs['loose_json'] = True - super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) - -NAMED_CORPORA = { - 'wikipedia': wikipedia, -} diff --git a/examples/ds_config.json b/examples/ds_config.json new file mode 100755 index 00000000..ba78ccf4 --- /dev/null +++ b/examples/ds_config.json @@ -0,0 +1,24 @@ +{ + "train_batch_size": 256, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 10, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "max_grad_norm": 1.0, + "betas": [0.9, 0.95] + } + }, + "gradient_clipping": 1.0, + 
"fp16": { + "enabled": true, + + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "wall_clock_breakdown": true, + "zero_allow_untested_optimizer": false +} diff --git a/examples/ds_pretrain_gpt2.sh b/examples/ds_pretrain_gpt2.sh new file mode 100755 index 00000000..6087c725 --- /dev/null +++ b/examples/ds_pretrain_gpt2.sh @@ -0,0 +1,133 @@ +#! /bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +export DLWS_NUM_WORKER=${NNODES} +export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} + +DATA_PATH=data/webtext/webtext_text_document +VOCAB_PATH=data/gpt2-vocab.json +MERGE_PATH=data/gpt2-merges.txt +CHECKPOINT_PATH=checkpoints/gpt2_345m_ds + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +config_json="$script_dir/ds_zero_stage_2_config.json" + +# Megatron Model Parallelism +mp_size=4 + +NLAYERS=24 +NHIDDEN=1024 +BATCHSIZE=9 +LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${mp_size}mp_${BATCHSIZE}b_ds4" + +#ZeRO Configs +stage=0 +reduce_scatter=true +contigious_gradients=true +rbs=50000000 +agbs=5000000000 + +#Actication Checkpointing and Contigious Memory +chkp_layers=1 +PA=true +PA_CPU=false +CC=true +SYNCHRONIZE=true +PROFILE=false + + +gpt_options=" \ + --model-parallel-size ${mp_size} \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads 16 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --batch-size $BATCHSIZE \ + --train-iters 320000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_PATH \ + --merge-file $MERGE_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 1.5e-4 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup 0.01 \ + --checkpoint-activations \ + 
--log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir ${LOGDIR} +" + + deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${stage} \ + --zero-reduce-bucket-size ${rbs} \ + --zero-allgather-bucket-size ${agbs} + " + +if [ "${contigious_gradients}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --zero-contigious-gradients" +fi + +if [ "${reduce_scatter}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --zero-reduce-scatter" +fi + +chkp_opt=" \ +--checkpoint-activations \ +--checkpoint-num-layers ${chkp_layers}" + +if [ "${PA}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --partition-activations" +fi + +if [ "${PA_CPU}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --checkpoint-in-cpu" +fi + +if [ "${SYNCHRONIZE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --synchronize-each-layer" +fi + +if [ "${CC}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --contigious-checkpointing" +fi + +if [ "${PROFILE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --profile-backward" +fi + +full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" + +run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" +echo ${run_cmd} +eval ${run_cmd} + +set +x diff --git a/examples/ds_pretrain_gpt2_pipe.sh b/examples/ds_pretrain_gpt2_pipe.sh new file mode 100755 index 00000000..ed60d018 --- /dev/null +++ b/examples/ds_pretrain_gpt2_pipe.sh @@ -0,0 +1,140 @@ +#! 
/bin/bash + +GPUS_PER_NODE=16 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +export DLWS_NUM_WORKER=${NNODES} +export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} + +DATA_PATH=data/webtext/webtext_text_document +VOCAB_PATH=data/gpt2-vocab.json +MERGE_PATH=data/gpt2-merges.txt +CHECKPOINT_PATH=checkpoints/gpt2_345m_ds + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +#config_json="$script_dir/ds_zero_stage_2_config.json" +config_json="$script_dir/ds_config.json" + +# Megatron Model Parallelism +mp_size=2 +# DeepSpeed Pipeline parallelism +pp_size=2 + +NLAYERS=24 +NHIDDEN=1024 +BATCHSIZE=4 +LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" + +GAS=16 + +#ZeRO Configs +stage=0 +reduce_scatter=true +contigious_gradients=true +rbs=50000000 +agbs=5000000000 + +#Actication Checkpointing and Contigious Memory +chkp_layers=1 +PA=true +PA_CPU=false +CC=true +SYNCHRONIZE=true +PROFILE=false + + +gpt_options=" \ + --model-parallel-size ${mp_size} \ + --pipe-parallel-size ${pp_size} \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads 16 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --batch-size $BATCHSIZE \ + --gas $GAS \ + --train-iters 320000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_PATH \ + --merge-file $MERGE_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 1.5e-4 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup 0.01 \ + --checkpoint-activations \ + --log-interval 1 \ + --save-interval 500 \ + --eval-interval 100 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir ${LOGDIR} +" + + deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage 
${stage} \ + --zero-reduce-bucket-size ${rbs} \ + --zero-allgather-bucket-size ${agbs} + " + +if [ "${contigious_gradients}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --zero-contigious-gradients" +fi + +if [ "${reduce_scatter}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --zero-reduce-scatter" +fi + +chkp_opt=" \ +--checkpoint-activations \ +--checkpoint-num-layers ${chkp_layers}" + +if [ "${PA}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --partition-activations" +fi + +if [ "${PA_CPU}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --checkpoint-in-cpu" +fi + +if [ "${SYNCHRONIZE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --synchronize-each-layer" +fi + +if [ "${CC}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --contigious-checkpointing" +fi + +if [ "${PROFILE}" = "true" ]; then +chkp_opt="${chkp_opt} \ + --profile-backward" +fi + +full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" + +run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" +echo ${run_cmd} +eval ${run_cmd} + +set +x diff --git a/examples/ds_zero_stage_2_config.json b/examples/ds_zero_stage_2_config.json new file mode 100755 index 00000000..2ab86c24 --- /dev/null +++ b/examples/ds_zero_stage_2_config.json @@ -0,0 +1,32 @@ +{ + "train_batch_size": 2048, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 50000000, + "reduce_bucket_size": 50000000, + "overlap_comm": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + "max_grad_norm": 1.0, + "betas": [0.9, 0.95] + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "wall_clock_breakdown": true, + "zero_allow_untested_optimizer": false +} diff --git 
a/examples/evaluate_zeroshot_gpt2.sh b/examples/evaluate_zeroshot_gpt2.sh new file mode 100755 index 00000000..f4f9f22f --- /dev/null +++ b/examples/evaluate_zeroshot_gpt2.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +WORLD_SIZE=8 + +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +TASK="LAMBADA" + +VALID_DATA= +VOCAB_FILE=gpt2-vocab.json +MERGE_FILE=gpt2-merges.txt +CHECKPOINT=checkpoints/gpt2_345m + + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ + --task $TASK \ + --valid-data $VALID_DATA \ + --tokenizer-type GPT2BPETokenizer \ + --strict-lambada \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --load $CHECKPOINT \ + --model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 8 \ + --checkpoint-activations \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --log-interval 10 \ + --fp16 \ + --no-load-optim \ + --no-load-rng diff --git a/examples/finetune_mnli_distributed.sh b/examples/finetune_mnli_distributed.sh new file mode 100755 index 00000000..65f3a9f3 --- /dev/null +++ b/examples/finetune_mnli_distributed.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +WORLD_SIZE=8 + +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +TRAIN_DATA="data/glue_data/MNLI/train.tsv" +VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ + data/glue_data/MNLI/dev_mismatched.tsv" +PRETRAINED_CHECKPOINT=checkpoints/bert_345m +VOCAB_FILE=bert-vocab.txt +CHECKPOINT_PATH=checkpoints/bert_345m_mnli + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ + --task MNLI \ + --seed 1234 \ + --train-data $TRAIN_DATA \ + --valid-data $VALID_DATA \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file $VOCAB_FILE \ + --epochs 5 \ + --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ + --model-parallel-size 1 \ + --num-layers 24 \ + 
--hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 8 \ + --checkpoint-activations \ + --lr 5.0e-5 \ + --lr-decay-style linear \ + --warmup 0.065 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --save-interval 500000 \ + --save $CHECKPOINT_PATH \ + --log-interval 10 \ + --eval-interval 100 \ + --eval-iters 50 \ + --weight-decay 1.0e-1 \ + --fp16 diff --git a/examples/finetune_race_distributed.sh b/examples/finetune_race_distributed.sh new file mode 100755 index 00000000..0212ecba --- /dev/null +++ b/examples/finetune_race_distributed.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +WORLD_SIZE=8 + +DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +TRAIN_DATA="data/RACE/train/middle" +VALID_DATA="data/RACE/dev/middle \ + data/RACE/dev/high" +VOCAB_FILE=bert-vocab.txt +PRETRAINED_CHECKPOINT=checkpoints/bert_345m +CHECKPOINT_PATH=checkpoints/bert_345m_race + +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ + --task RACE \ + --seed 1234 \ + --train-data $TRAIN_DATA \ + --valid-data $VALID_DATA \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file $VOCAB_FILE \ + --epochs 3 \ + --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ + --model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 4 \ + --checkpoint-activations \ + --lr 1.0e-5 \ + --lr-decay-style linear \ + --warmup 0.06 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --save-interval 100000 \ + --save $CHECKPOINT_PATH \ + --log-interval 10 \ + --eval-interval 100 \ + --eval-iters 50 \ + --weight-decay 1.0e-1 \ + --clip-grad 1.0 \ + --hidden-dropout 0.1 \ + --attention-dropout 0.1 \ + --fp16 diff --git a/examples/generate_text.sh b/examples/generate_text.sh new file mode 100755 index 00000000..6a04c492 --- /dev/null +++ b/examples/generate_text.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +CHECKPOINT_PATH=checkpoints/gpt2_345m 
+VOCAB_FILE=gpt2-vocab.json +MERGE_FILE=gpt2-merges.txt + +python tools/generate_samples_gpt2.py \ + --model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --load $CHECKPOINT_PATH \ + --num-attention-heads 16 \ + --max-position-embeddings 1024 \ + --tokenizer-type GPT2BPETokenizer \ + --fp16 \ + --batch-size 2 \ + --seq-length 1024 \ + --out-seq-length 1024 \ + --temperature 1.0 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --genfile unconditional_samples.json \ + --num-samples 2 \ + --top_p 0.9 \ + --recompute diff --git a/examples/merge_mp_bert.sh b/examples/merge_mp_bert.sh new file mode 100755 index 00000000..01e08b12 --- /dev/null +++ b/examples/merge_mp_bert.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +MODEL_PARALLEL_SIZE=2 + +VOCAB_FILE=bert-vocab.txt +CHECKPOINT_PATH=checkpoints/bert_345m + +WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ + --model-type BERT \ + --model-parallel-size $MODEL_PARALLEL_SIZE \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file $VOCAB_FILE \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --load $CHECKPOINT_PATH diff --git a/examples/pretrain_bert.sh b/examples/pretrain_bert.sh new file mode 100755 index 00000000..ecf59477 --- /dev/null +++ b/examples/pretrain_bert.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +RANK=0 +WORLD_SIZE=1 +DATA_PATH=_text_sentence +CHECKPOINT_PATH= + +python pretrain_bert.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 4 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --train-iters 2000000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file bert-vocab.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.0001 \ + --min-lr 0.00001 \ + --lr-decay-style linear \ + --lr-decay-iters 990000 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup .01 \ 
+ --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 + diff --git a/examples/pretrain_bert_distributed.sh b/examples/pretrain_bert_distributed.sh new file mode 100755 index 00000000..17ebae1f --- /dev/null +++ b/examples/pretrain_bert_distributed.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DATA_PATH=_text_sentence +CHECKPOINT_PATH= + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + pretrain_bert.py \ + --model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 4 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --train-iters 1000000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file bert-vocab.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.0001 \ + --lr-decay-style linear \ + --min-lr 1.0e-5 \ + --lr-decay-iters 990000 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup .01 \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 diff --git a/examples/pretrain_gpt2.sh b/examples/pretrain_gpt2.sh new file mode 100755 index 00000000..66232bf5 --- /dev/null +++ b/examples/pretrain_gpt2.sh @@ -0,0 +1,43 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=_text_document +CHECKPOINT_PATH= + + +python pretrain_gpt2.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 500000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 + + +set +x diff --git a/examples/pretrain_gpt2_distributed.sh b/examples/pretrain_gpt2_distributed.sh new file mode 100755 index 00000000..1d746250 --- /dev/null +++ b/examples/pretrain_gpt2_distributed.sh @@ -0,0 +1,52 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DATA_PATH=_text_document +CHECKPOINT_PATH= + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +python -m torch.distributed.launch $DISTRIBUTED_ARGS \ + pretrain_gpt2.py \ + --model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 500000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --warmup .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 + + + +set +x diff --git a/images/Makefile b/images/Makefile new file mode 100644 index 00000000..5efde029 --- /dev/null +++ b/images/Makefile @@ -0,0 +1,12 @@ +default: cases.png scaling-mp.png scaling-dp.png + +# for some reason the size option to convert in scaling.tex doesn't work, manually do it after +cases.png scaling-mp.png scaling-dp.png: tables.tex + latex --shell-escape $< + convert tables-1.png -resize 650 cases.png + convert tables-2.png -resize 600 scaling-mp.png + convert tables-3.png -resize 350 scaling-dp.png + +clean: + rm -rf *.aux *.log *.dvi *.ps + rm -rf tables-*.png diff --git a/images/cases.png b/images/cases.png new file mode 100644 index 0000000000000000000000000000000000000000..8f52c38f544c4e7b52d21037b19d8255ee5eb19b GIT binary patch literal 11807 zcmZ`w^FaNX>24}0h?>91bD 
zu2)@ksESrml14)&Mg{-?XtFXAY5)Lq%s*ZS@!tl}_KY_8M_{eQ6vY65#sri%6S#je z&`d^65diR|0RTW@0Km&X73dfM@L&Z1PK^No{!9RXz&WQ&Rq)>dw5hzb1mH6=VY>gH zgybxv>jnTYQ~vLQ@-7we{HH{4msONR*hfOaC4#fl6b%9Z7*Av+#5BCuF9N*0(?)MU ztsaD^g@We=3Lg#OU~NKlRItuN4h*O6t5&w^dZF-fRT@AcUle_lM86h> z06@AO{pv-BcJ4Spt_Dtn^r)#2idN#ACN=RywVI`Zx%}e^IixNH9F2o2AfVJ>VU){_ zvAviXHgo`1RQ$D=e8t);-Tn(eMeV!mfh2?TD7l;s2zxJq3M7`c4Gn0n>S37X1 z?XU#nNe~z$AwBr#gBaNLve*y!U1l!D!l@PM86ZcD%_N;hVijxMX@EcENYxP+$rv9M z&E6r*`H%VUYSH;pEqzf2NFBM23Ma%Z-`RWa$=S@Sn9qywGOL76+X+23p(g*+FrG!_hZd%k=e+lE(2% z+yHl|vggCH6Xn`m|A&u`q?Q|rc4D~1ZGykm9OnE@*DFwg3ThjI483Vrqfm(tu}9-WKrO!EE{N43F4 z)khv7Zm&INW|Z#v>I#Gfb=ab0b?L~*QVFRiKUa?4(vrH=TuaT>!xjH^`asQ)zNr+& zsOd-AkAAzsn6w|-2ySZ@a*b*+Sy>)iE+QO{*+X{Dd8KM{bcgJ+OB16a6tNy!|7DNI z7^pI4>B?!2rDWx*p6`Y7&NQ97=~DbihFcTM{=_|T`Dx=?*GT7K?& zTr0%o=pFBQRXQZkHznJMyl>*JY^Y0C>-64ut9P;SWRlV-gy^=K=3FbQCPVDK+y36Z8XwWIks=VZ7o7#^YRmc21b#VWuTF0)@&_* zZgMs&;5o$0oJPHjCJVEXM;*w7?o-bGtCR0PSd>)Kn2}0SNpXVb4Kq$tZVL%|R>mv= zrTiM2EI=d-D&ULfD;(OI?4+bj%+zAT6+ptXrkW~O;#d4}1u<~LLqS1 zN1H4FBhp>JBSpyOpfYd|s|IFmR(_Kf1r%Hf%Y{K)Cgx~%*YRXac8muK3tC#{%?o$F zYJ({#b`gv+zjz67;^s0X$rkPZnV4oQ(amzT94}cwhh`R3`gwH0FVPCMKgT?gS~TVH0c9R4QiWN3mzAQx9od+z}2t*7_~glfH#}nU_zX>8mD#$T~#3g2+%Mx zT}x`1oe=9%Jo|U*=4e*21%ET4N1x_0|7q%ylvJxvFGUiO_?^ni?1DB|TNp-f<>Ig6 zX2ZiD=6*p8y+3D1%|~TUh(bYDZ37%45B^rg9&X>|h_5Fi99b?K1GnDCoyPE{k(#!> z$WWz}dJh$@UjnQKIi?b~Ufn>5KzJc;Fc7Jwn{oj4r_IDnp*^hysupnMr zm5HVELS4+G5NNzqop-xPvwDEHg{qi$G7+`9QkyT;;hv@hjn1AH<4|XL$n17n#Om%N z*?&DOkm#Kfn>EYC4gSNGqq`T==u)*eX5&$k&&w}HQBF_u_^V_j#4k^fmlv?x^HY^u zG(Ks%R~_yrF$@-8^Fg>3q{i*lL5`Q3LffBw^Rk)mU#7!4=?QzU;b5>N;}2DaZD`i# zgdq2-qPk5{GFFcQH;p!0Y0g7E&-X{K*u7RKddf)M>Zzx@vm5OLWBJT!69Rv9{++Ag zXr5i;;1?5^5_|`;2+xV!9kjqdPsgcuM0nqj9Ns)Od(gNW(W=;*1wM zWclnt1P}cV@u;~ZDr?cZJRCS9gU(0Z{DKbCk_4X^4w0@P(=R<|ixo4|{fn5JAM?JO zb10TpY}-DRR!wXP4jr^~Z%s-UKOS_Is5XxmK8 zPTKgqSjhz^lRdWPL(_gZ<;>dKbth!>|N0LCbUSvN&hm1$#UOx;E3-bn1b3Q*6`RpO z6QmK9^s&ZvX4f)OC@2hnos| zyT*jD8y&VyWs+YGS3?5PjU5{!o>$tlltm^GvRSRT>GrI^jYa!`3H7f&hjND~t8>cW 
z**5{?VxF7h`Yh`0tRe6RD?{7n7@W{;W`AeGbJyB56OUWD+jPH{D7WU*eMv$u0EFDU zekIf()KkbQ0{-`hR{Eyo_?#|lS6F?Ii=_Y`C=>-)%lpdiS?`3qNb0HAP=Sh5V6zjt zg*VHA(|8F^&L{MfP$NUM>VX29NOI-jx1v8vmv;5sGunI8bENR})$Ie>2E$)=#rG5~u)O>RTWcx1Ziu6T;dzA#kLsQ>Ekr9lAf?hiVeotGbk zWdU@d!DbMV?D?l2KZ-j#pCN(T2ksvJN{eVxeAnoC7%@CfjN*%xY~F}-vwdmC>J<*^ zW$Ou5ID=nqG1UYJ+!CSEsZPj=vLdg^VuCPwP8Xbb2F#&S{SXh0N`{xt9}Ohu@%Yn} zK57G-@()Qc8!7|14zc?MpC(#Zn$rK3cn>J(3GI&}vTY%Z`Dq{>1t>)AxM<=MEu;&u ziL!I6M9GKYi3@X$cYnNm)c{#4$PSla`zDGhHLi@>-5kO7WfB?L=JjSF`D54~)7xCq zT`1ES-81loCG*00y*!Vo@2!y7F&uk7Oe5^xiKjCf4}_g+dhf?XBZqIy08$&E*`%N=85ilV2YSmn}yti?^=uFfr-0Ix`cbml&VM#$zoc4&&t4 zr)%X-YIfnN1-y7%eVAdt(MHWjK@Rq$u{p{zyFXMC$Z8Ry19v^u6))@tP6}{$nZ`Q=M-NW; zvkP)tEO11jWJh`}3+28HE|gux3;2CuZy<(D(^}DdW3k6>Yiam)<|Ex@Vk@B$eMPI= ztmegwh2Qq=J+BAv&qID&xcRvrNKrbTM)=maJfv11LTZ9=JB_L`rafs?R?RVm5rG%D z=g;C{h2LF6Q|Bp+DOP}=z2r2C-Yk$0`=_#k=p1P!?FO@#g=L_RSf+1SnN+)8 z9df&adyWWhm6g+`o}k6v-F}otY;S`A(^DSVr_x%6;<%a!gXqU}N%JkYHRka}8WW+g zj}=aS9%qB|crs(B$?MwC%m%xBo8vm$d94>m8*_GOu>kjEs0+zreYdHn5(Sdaw^j<( ztfG8>fh{iyyAhTs1KWI;qZM6w)2;Ehkxn`UGjX=^>*7V$$Ue3^K`6`vg6a!K$ zSB0~^vms|c$ERYG(J2%_j=pd4Y-$33E>%p5`*=y6&d3|c#Rid@WY3BpB8D657rDUg z-u(u52{RmzU)!u;(IXsvukX<{RoO`6(gFmIUhVui{N<#U#?dDl6ZKpwj8Jkq>N*3e zja2$qt*b;*0skr8wYFZ(z#5YvJgoD7q1{?`=50tj_1Na8|BI@}EsVUqd#J)~r0jRw zDUSe8O?Kl6i^c&X_utGlG4?`(O;}yRJ(RdlM_Y5MBj<)EJZiu50G2llI$lQveEK zYYIihC7gt~Tm2G?!7{zlU9~ypOt(O9R@)Zq$DyV&8HAeX?KUj%8G(!IvNY@9r;Ke{ zpw>x7P$G1_=^{NLLtn4Cg(4jkA_xz**_ahclug3JxCVe712u>$rlL?HBgz&Qa@eP! 
z<&P$s1P$(=$1UEwI^EGEk`_%g#Ri1|8=#T2Kq?@3B(WHZC`}>;S6Tz`f1={hct*|8 zNxuh>VN2O;HRRLQhU2y^eEtqpqxrI?iJ8J#tL*%Y@d6M1vj0}XIcnhwms+;!faHbz zZOB1a%=wcEN(_02U7^dy@UH-FR2D35Gd!Glc!VQNntsNhL`j}+nHkz_0U;IPej5!{ zL7G_D?+S29TCG+YS`xN%(BHse$D4U@;?#Qxp2~7m428y;6WDSaPrY#uX)u)b;Ah2w7mB`6D7 zwCn0os=eDm6f~rn7FWsN0q@kZb`K|gAD;v%rTr3iQZ|k#7USK@w18*;2K4-0joBeMTuQ-l`-OTTcpKEMK&s`<+wta@B z&-aH4$h=U&YVrd?vz^xOO;)rAO={;ki57^y%yHf(3O+8sZk{(?bx(Ti7fx5=n|k=i z;1l;om;!<FgMklKU+4C-rD6vk9i{<> zmFq&$BCc8w^+65T^9lbnjZ@)g7dE1}&b3gVr#z$4dXyA~&KjIE&{iIJt~%8x%b#O0 z+ILM44*b&?QKEz{O$TF!a%((7747b2)BQZXEO{n)6b&rPln!o~#owphQZb(=9R1p9 zooXsMKP$=C?oV#URWn&XMupAHUV;^TYEbWg$91tB(|*@c`(VAv;?NGo##-2p;?$n0 zY7bwKEp$wq5kmzisF4lEIL|Bfv*)C6M4@6?gSYQ|7F=B7c*D6)i`52fkd9LJ|F<^? z_3?@FGTe7b90X|@ZHF6)Vz7Y@UJKm7X&*$-etvqemN6Z$)v$vGMCtcp+WVf+W=y)} zrc?cGT%vUOIBJe4`Qt@gd*S4I!(m9-39 z)&=Hu`)FaEpcsphi*g@dEzrCjmKCo0LaM3zewF)y#2j|6JZ_Y$MdlZ_Q3~KyJg(s{ z9?;Dy^twB-m9Lg3@%-3{!^_7#{m@%E5s#$FoR4+5@c4F7cFtQNmE&WjfAjJ|Q9&e& z={W>G15iDz*iYU+GcMUEk_~-q?qhF6iaj#4a?}<_GR0K4anefU6@CIruN2lxyZwzm zi0j;%QL#Lij8lpq2)XjBlAp}@n<TKi=WD zZpj|8Pc%NH)`(Q%;B#h&UHFl1@t|7x(pGpMPyBPJ34ix)$nv{ZH3hi(u%KFN)o53= z4AwQ2W%-Q#$CsJ$sPrDhEF?+GdOgqFDUC7Ueg?g2WikGXt;amhr|+M(>LS|%%*y~A zT{>(PZI!HHIV>Av{Ap-LsD@}D7ojDXJqdmyF=jyA?Z@@bSag~!9exO~P>ha-B35NY zegB`gy^X4;JEc6rP7h-Sxn9Uih_$rA$}3OX;~AQsMO>Wo74%bXv0`b4v4e)u z?Iyn7=(Q&ERdF$>A+uCrn3prRwkUWKU9XySqQSrLC*%GEr*$W*1PkWOn_$YqQzfy} zYmP|x>ZVd{a{oonUg9&wjMba2{P1rUM8YX0jpBEqat#XB(obRCb+1H1rI#THNlUaU zpaUpy7f*i^XMhK&=0q_3QvZl%*YzO|Zhhn3Rh>sp-8c^xm^XFbHUhgOqwyjc-SoNcfJbsI zesk$*-vY9Lt%@5keaMdR8*M4_4+-p2>zYZ}d}kOzx!Ws1Yt3KoDERck;WN{lZ5)tD z4V9^Ni&oZzAJ=RwCu4%8)R3n?Ion*(rTVO zgV1t?7RWSsSMw~eqxearH|icv zg#}q z;3=0ykMi9&k8?tNpFIOWOye<^*SYV>GRK5&_r{J{H8=EJ<_zb)wK)ZemJhyi1(|aK z*wlI{X5Kxw8W1yu-gdjRM{5gLSeSwrna5J6G?!L5y6kpm3{}vVRZ}<+V+MLxvQ;lEW%A4RHt>wV1+11J=}Ys>Ryri* zF|!3M@Mf%8t%3ub#Gp|N%`PxArz2*10is##vt@HSPvMiHQ@~i7_L!$U3`RsNHxo$&UHRUBt6b5a!_E`e(`#pGtxz9 zB?i5l>`#o3XV0Px=w0!FPBZo!em7$65ANIyTXrJI(kg36JOUuP3DWda@S3Ka=)Y4@9^~ 
zA^vXhvnUcn>wP5BO7qd&@99D}3gPw|6=ZKL`78F1Uq%lQ#C>8FIh;i(Hzq)Yr_~k?&6*yx$t$ z#qP9R?wlNEjJPl9g1xnAzaqxl;_YmWtTu4EugTVj#r7x-rD&;%k3F?Pr+$?=ahx9P zO^au;#BY0Irh!wbvFy`G{Gws@hYV8GIa}|_)ECSUlA>-5&hvBi(Z}Ky+}Jm*PuHe5NI8}qnOzr z-9@#_P<4n9`=LEX3-PY+q-M#eh6|RQL$!+3ek}VoX?V)TRt16+Oq3f91f?hbet#8&eR^QgW~tUaEmDX>;gUS>mWMj~PJ>P!WJrZOU@wsX|J93P>daq+C2! zWWW5Y2$X}F7UmcrMMZM!MN{#6+u0k)>LseqXldPS`=Tp};UtMx38RB^%*A6Kz5g)@ zGbhvlq7oSA)nNFY44l)^G?5wL()=^eq!O~_n0L?vDIZb1_u_~Y$!&5^-OMCSH0Y_# z$ZjSJd>a{v)NPz?57HoDu*%ZC9pQr)t3Hh3CvRo8pnU{ScnU}BIU?1%t>|ZxY|i8h zj{4W|k-vVqmlz{CdU^?+_CjSH!oxd46BrlV*Yac+Szb=L+}m#q$b3b$(%*32;!@zT zW^n1NXOOof2znrN9g{aKG|gsv7I z2tPm3*+Q_k5jYL9gOHVXx2XHPg0hvmP<^~7e)=M2I%5)h#y;HahvXm`1!3d_z)tnn z3-FsJuxTF0lK!0mf{ZuOF3;N$4i{0(Vo)(X?n_!m_L8K;s`*3ld7YJ$;6(*iYM;&= zbkS^iR)Ci^4geJtJoU zUpeUN9uw!+FKrOg(&l9mbVNnli!}Tf)MX+ZUzn`KYO@T`v4*J+;R0hrhI?;06PBA3 ze?_A#01m|{X~0^h16uht2YKhoS5k_oI3Ey6GkZsba;S(d-(;O_IOgKyFB$0SHKT`M z3uqo?&cloDc>wox_y3GqvmCYs4!~0=yg>IrH*|&Ydxc1Yw=$+FyX9;=@zJ$}T zxBG(HoP=Ce7bixke?szP?XJ?{sUm{9p)5l{A;661r4waJIv?n1c%c+e%@MutV>7JT zs3BVS(ftl>X?9J+JgFd*-LDr54@c|ivVA=`-mcu!TMg#r+@bjC%d$uo^#9tC!wP20 zc%^*zDQW3FH8@sU?}k0RLvC{iY+L#;Jsqet0Q6LEGJf^jlay*NTxLFK3Z1N@wm@M7 zfgf76@**vD4YPQR^kM5-wUglXElbpvDhOYBTqKjlsE`*sg!xUU`XqT@6wColjrOuU zxtRaA;HwRI3e!4BmFlqSLe<(ak-%*}ql6KSi&~@2 zq`xv|!yrx^8F-zhQ7of2WRNvX4A6WKQ8Dz`b4dmO5sl$62u>lNKOUy*AUq(Sd z$q{70uW5_aMnTXr(0WDAs$P|}kbnL0U*0AoYJhUpWmL5w-|wePobG82k(nz2aAkOqd!Z{`7qL|U zbaWP*V5lrK`?0FVBJ7F#b|59k#4h&i>@vf|5*r_rG5uE?WXs?jcolKmiMo2CKgF4e zM|LKrzm`b+x>nNZ)O8lY1$3M##k?J{JnEIbh6ZGKX`*%cOF(~3$9cJukJSRIr;oi3 zqIwLsvMnJmjoJWJ+g(3H7rnq%u9zq7Z!z!Jb(D{73yN7A$Td7+goUs;;y8OuPVyH6 zgA#!Z6UFSQQ6DET8GNozrVAu9ao3X6_(#wC!iyS6fEK>mmcP7ROl5sj``r~jxo)!f zCqF{vPercVHCz}1&ln*h!S442S4*9OV7cJ|DH;*Dblo$&1|#w`IsU zCKK&{%zE`r5nQuHUYe0@Y&5U&O%9i_k*|*l_Etiz>fYwfWqQ4bQa8`Ce5O27sUlvoDpr6b^}X=Zae|kVDX=BH%LB0fFR<5Z&rI++#uSZl!B|Rk z1A6Ni91C{e{#5vRM$`qk8Im&2V8zeJ58PXQrss~7BxpnK732Fc8Zi7(%9Y8#Pn8-} 
zN^%~XB49g$w(54Fmf+=%&&q4Oso8zMH&AptA%4K?XkPVa<|6NZ83?UO!!NxTLGE#d zo%mLOwZrD9^N|fHw^mCjJ6JB_x^NWVcu<}BCHyu+iUj?lAsYJxL1^^XUtJTiZM-V9!o6~jOJ%?(HX0Tyd zUX2A`FYs3zE5r;03jxSyTLyl+hiYcrO?Am3D|^cdnK@UP$PFUIJK>87c6dNp2mO%g zi8;AQW}xv(n)wR0&ZQa?Y~hy;Nwxmq&-&oiXWO?oWwcQl5rV!alT2 z8>SO#sYBD~q&xv!OnPo74e|qp@q_2zMcSCl26g1Wd;DPv&oosKsni=KT|f&DHVFr1 z?ZS2xXn9!&YrUKOA+0lSVG5_CIpaT87Fy&wb6n(svDArez`*0yLHeG76EnXps$@qw) zC8mj5EqesKHl3Q+h0oitKsS7Pn&r`U+{KC{&x8gsBWfq)d<;bt9f#Z6{a9zRN;X~j znX63Iv zmiwzi_Nf`4q#ASwnsqV;_Y#ChQ7cs)^{Tmr{2l%taUVP(E9(O^uj~IYaBwoWvGn=> z4I+&|v;Pc8|Mv!UCrfuP6ITm>w2A^y+SHjc#qXJ|?b9w;ein1RBIJXc&ut#|`Ns2@)ZUg!l}l z=SmTrsf=dR8QqI)R}yg z*R$lTzjaMSIl+dRANbc*f=m%b^{$)3tG-GH1h9;QLNq1#sgmA&GsoedP$^wjxypDv zG4TzEZ;(>m*Q%oU#fxeJhyZ2~Kw{dyv3rPbn`_ql)kt?2c;n*MEUfZkBtALD?16QX zV&8;&#gLUv_Wh-1)Z>>1ud&PBl;r{%$ zGe%G(iIe^O_d;I$qszZ|6WHE%Ksas%3oUw+H#)oMYW(A3yk4P$;%lqkt)3pPyy*l$1^dQ038Zy% zRwu=W#&8I$0-3lV2N+;Vp?A$Y1SxTkf>4q!pEv1){T`qO;etlrpm3g=< zCQhIk2Ny(;FFL#{@Tb@?jcm_xKekxE##%Tetsi#MhR4!2_ob$1_rfW%ij+1N;1;0n z4L>v4&JNJ5$PVsM+F^6?`bH0J(?uf(j`J#{9m868hwKU$`6>!Vpc=F?6E4l8$19>z9X z!m4^&3tGnHJ6C?vu>hjbW6YV)1Uuf^NOrA5=<=dQG8|}%|g#=O6Pwfypi>++>mtEeZax^(*!RnMQ}UiK?atGN-lA`%Rqlsu}wRhNY|+Fj(y>F zq%tt{K|TsTY%1^GPL5`7)?v0Z*k(V}dLs@*VrH<~aDn0Q)nLD99(<+?yZ1)t+Fl z3Qvz#Ou0o{ttOLo5bVhKg0~y#)*ws$>v#Ta#`KZvo?joa|1$>f7D6zY9s5M`vJmIg zrmr60b1tB}oxtPwCU)cVc~p6W*&XtBVaRAYaV%%4oh=gjGt9Sq zqsxrO&TK!^ZM~Us)8FicC^7G8bjys;-cCi9`}h9Zcdsyz*JpoykgYYLcH|d**#j0{ z0$b_+NYJ7GpI0#6Jy-Zjy#3_mp1HMAJE8F&%b1#x$agnllGo8!58G4kK-o_E>XaUc z<&y0B+(wxd9VoIcvCzdV1VGwE_Hya372(7ty}=}}=Sz&-JFQq$bYX=3n`AVlEUy^| z!NcukSTwYMxu&c_5xXY8(`?y|9;XhQRbzj~wc}HD26PcZd-GSezaGH?g}$8(S89g1t6kTHdv{wwBqkX7jGqgWvTP3 zn$`i+$5zr80{12MPE$bFmF@AiBNKrbOx~fX?_ZTgzeXr@Z-oRG^pk$g>_1g)OQbLM z#0o*^%Ib{M&HwaPe+>T-k&e9fQdV>qRK386g5@F)bD2`cEtelu!LMXV)P^B?VRk)qFmH zAHy=96pBhkvh=5s%2w`QZw1HUeQMIZ7tyL7R+>iK{S74L*V^y(?>DY!IGzIBSHwCd zjfJ;Lyj!SYME=~?_P<)BPV)&d$Vua4z)i#8&Sq%&^`X33X#SP7J?vC>1dP*pdD%YH 
z2j6&ONlF&lchqRbWa=Yv7#wSf4HPy&U${%Zn!W<17>rpi)s{o0RI!zv7^olpB(Cl^ z`^OlmKkQ9v_Sa7nrPcOl)(vc-Jk}GR>kh>5iEuwHwHelC{0OxX)^5067W9v+Ir$t7 zf*-fy$8<^faDguL9|8u4nG$4KTk?~G(F^v(Ba~znp^z2(>C76lH_>BM#%wDAEo=Li z+i;X(PF(!=3f(DjEoJR;|7u>3Z`Bju3~2NAE2`YITir_dmF3ezYUA*DG0d$rs+dV% zH@oSqOdi=S26$rfD~sr}4NoHbyXu~RK_!$jxU)9lu zV=mlg9}lElPF7cF{<;WfalyptNyjs)Ql{1j1nr@`HDlDgi>sR4iy<*YSH_R_&pL}D z7FMHJVrnDre%D9ioQK)gW2AZtnCUn2i7|{u6EY;2Ui@qv$F}{%sfmZIa&cCz5k`xH zkJu@B!^yZ4;0dHtyrB#7{26#&(hil(|Em3VwGL76AT7oZ4yuElzy3{9 z>W~^FP#lqI6D%{2VH|cq{a>zZu;Ia4YoZ@C`#pmVoAz(9WbnsiU=+oGBL3@ZjrqvP zdU>qto72${Tddf2^FyD!J;ztc`M`RPQ_)(os1I1@$Za@8>(}BL4zaw6KzH7xYzB-unUZtE@sP_2})f1_jjB#ITNR{l{bi+%p~Rtw}D#jKlvwp zI-w&iMbJyHe!u5HH2eF6UvdpEGt^{g+OA4tR`55bCvhuL$S@=dF8I|&0kaOpos?*^ zbXLoa42)s@h=@e?LeaxIHq_I^)a)EZUz5bEagA|ZPlhGz8B^ZQruGsWtXT~HC3hQ6 zu{q;%A#q;_!-m&cqi3^wNv{7)OPCJWVt+E|xH(q6E^`kXVrUMtd=8Ir-S7I6;xE5! zc{x+O-c>Oiyy-)0eMET~B4I6Uj*E!U1y+5Py}ByFy0cXOeiK_g(A?mP5zyn`S-oVF zc+(;-VBL$xO9^GB#*lUPG&?U@qW;@JxO+j2xS~pQgYvGYY;rqT@3Pw+XIQi{&e!R! 
zDZ6ryUOK4x#3B4OwQIfHjdO)d6aaW6yRcQYdoD7m+|WIo^=} z^a3CMA`lWpzNoO<4y0+*a~-rwfzM_$5|Xj7gte=h;e3WhoKw*s2_A%YqF?v0bvY>90=OjzMzT536a zAV}a~pe__c(tpK`(P6=!Vmxsz&w~kE3i%e<+8TuDdPx>!LvGpI*a2!r$w3!{P_{I=Oz=V8i3NrZG_VWI+!QKWEt z2_AjsVQn~FOi>mMbKQ|%ZFrk-waXVyF0h+loMg}|Ol9^XqkeF`NsBGCZ2h}XGpOD< zVtF`HrK8s!Ea$79)5F32cB&2JH+!3M#SY!^z}rx(;f-BSO0G;?8*b!ez3fhgWllN% z{d59)rp%qLlGZ){$4#q&+vMhk+OWhsz5Aq*z1~;{h3$&MPQ|I&y!r6oV*3nVM5A|d z9XVpKn{jy;{|~7H%JS7lPI&Awt(;y~wwFSfgc1T0RnzmRciFZmL@Qu3gpSQKXFM(LGN z`@2+g<>bL9(%!@?MywMX+`qqtpK@@S*I9H;ThAJLx7lo8-dm{cm9#TUdb3-^luH3k z>yFy3w(*T*#2!W*A{t$(17921%D7mKSK_T09Q7rl3(KB}CeBw;#M~9S4W*jwH1;pC zx(Um>l2%fV9W0aDi3_I%OF6RH)JAdc*RQtLJl#IYA$f5O1L7b%)(aI*_}u;Eq|gvp zf=L8A6fu{N&`ZxKBsLM23r_AGe+sw|F2lQxv<`44f=i!|6h!g2!_ro}iXuO_F?buJ zI);N2E2C`Ip!~Kg<_q7GrN>i;9`zQYF+nN z{@z|8%2lb_f@)9P7j!I+_`9)k02d87_2AFjOz&1d!4!m=x!n3{8m}v4jiC$$grJFL z81>$*kwTEheS?sS)V|$;G`4|_U+a=6ejcnBCw`VZ!Lrv_y zHj{1&J@U3}@C|P`5#1AH%Xf>%XDBPzPa6h>2+Xi;s|p zTPK)WSQsQ8{P%$T(cBLAByg?Ok2raTJh(_cC1D%|p~#q0z+B_VioU0r(tB&8Rk}7p zr+lU6-bGkh-n)Rqy&gE>2$UUcT4)OTU0OT6^20`0=@H6f8w+BZ{!TTmKd$lAybV_y542)7nl-3=l*}1Q5bfi**f2QKFl#Jn?Q>eyVD7@%<}! zbIPkFr*`FH(KmxHb~9;lq%nN1@lS;-)Z3fO@NEBLW`$|c5uO2Uq<48x)8_(gaAQ~- zl@EdT#>z>HmAOOpWQ2gmeXHSo+RRbojp03un^S2fDV&5l##1VC(9|;yVa*5PZOE|c zjZ@Kly?EZ>Jmp~&k~BW>I0yc4!&!XVKvQ>aCWPhtROy4SWLxQ1KZmLH{e3b4HJnug zQ$*lV1mnv21Dv=*_0Y=pL{%tHIIquDJq? 
z7`zFoh$Sv4C~rxiDE7Q$MjoC3`QNF<*ILv5>7v_Q+_2kE~a?`I7L zf!Ad|RkvZ`%Frrwh_9xF<^0!o&;^iJy=I-S7~?!GQ?l$rw()~tj)$5hv`m?q3cH9O2mb>!q^HQF> z=6?RS@1yo^^md5+!@+h8liO@lreNfKx-kxv6+3}c|-Rx)#z z$XXh##&eB)vEf~IU^`v!5@VC(S(~+CepiXQ%m3FH4*)t>%g^toPw;Av5dl4n4k6*o z;n2bfC3RNUCzP8Si=grLb<%k{yT#*=fa-hHH4U+utJ%-$CaY->H6^hmad41v^3T3D z4xa7qmQ}jiHN)K5i)Hb$D+%A`LNcZ|vde&++gr|t1opj1jfxeT;~KkRPxihBckM)| zVT%dRp2w6A9)6>#v?Ni=125)%)Yh?AJTJ6%TU_g*l=^6?fRF;o6LZGrnazzG z!Oyoj^(es4$>1?#T_&eX2r7yo!ZS{uB94DDc`UXfvMl43)mX)?e^BJo5hOe+-mevV zc6IkaA@=Fxm+T^GkK`IHyx)Ufdix%$Keg_d#)@zFOGt`@3{E~3#IKd334NpG^O@l$ zuG=U4LtC1Gew20Pa>?&H&*1X415`7;SZ;(Cuq1yqT#>RzBe%T?p-;~{xvm~H9VDov z@tL|*KbE+G5TT|jrm~&7($6+p2!I4XryrgPJI9}|$H=;QHMR1rJ;m{dQU-ZJ5Emn> z>KLKU-ql4gC{X|kJArH+kBXRBR@H&O!|sA8$Ufkd@WOXb7a)a51C1y|#>4LeLPJXA zwA#mx6s}>-@N+Hd`u(vBVePym4h<`B#VB$;^6q0lyVr*fEX3esS92R6VaZx$Af zigR(n*M=?5RiWyQfTTtB*=9HLgV6@!{EA2O&jz9n)z(S(T^MvAu>sKvSTPBS&1w%j zjg^$9b~~>!%8ROQP)yg|gGf1)^c3AwuWn+sW{3iRn58SNvoC8I6cfroi%NG(fI%Vp6Tk{$!pT}9-7}`_u%%Lvx!<~88uVM z&3KdrB}l&I^M~AbNp`IIkzo~|mnHF8+=d$7@3X^rchZ+Au-;Pn#L0>BTMR`Y}sl z8~%B9MA9~=cm7qvm*9?_qz-AgYR-J)BmP+^bMUgYnDC7)1g8dOPc$q@g^G*hLMse_ z;ziKC>d;NCzitjRh!Ym*AL|=Q)+`Sx@qj0Q4 zvrylw0Ts4I-$CwI9x@6b-0zJ=hqA=q`5N5lM2|Sjr!P8GfwvZK?im%JuNrgGVH+< z=nH~mQbNcstuz7I*=%4ojySRA$Y!f2V5n(=Zg?~}(Ho3a;DaY`&1k)l$0V$U;?tIh zCSfWTu%!s9)@S{b6XhF`qXFm%(@xNa{a`;nr9zmaU$y1-{pO(PUG2>*$|Ln5&QUH$ zq-^2M==_uj=(=e5;*m6@f~53qG{_}SB_HdHJ*Q7vMR5X=^1+$5@AOeeIUULggHku$t=NTIGxW<1jaNS#<-h|^zIc6DZ$tdV2+kewFb zW=L8;gL&C}GPUf+hOBn`K}hhXb7gD1Y2pgz@wDnZGCAN0<-jQ5QW`W!_Leem0llB# zo4lAGy%4|8b?jo={~}k-(T?z^;pqH^4=P;&lVgR-@cM0EA;+Mq&=ISRC_FvjSQuy0 z9EfE0S{|I}%@DwW9v%Jz*PqfB(##6JeG#6hqFEi$nN*EX8$!fn3hJ`JZ(Q9_5d;aS zWLW5a3BL&qO+vyxf&gz>>qB%s*xQz>tz5Yo2pW2j@P{On*9WvAU}6+)qpLZ4^=q$0 z?dnsUNkAQvMfKH7?655Yqp;N&VjUU&9z|qX@?C8Gex-EFPlkuqHQnda15ZrULT}fi zX9gPXl;2gkVS%6}vo@!3obe*%vJ=7+-HCnX(dg`@Y2LU$W@J2sA=Lk<8NmQ5Ai-?z zHos@3{iX>ocE1EUMSWbcPDSyJs}9P~2VuC9l3t)SOAe2v5s(`I>4 zvaj4OCnG_FV$wE0>S_J2DA~2On}T_#dy`0r^eHK43N zM!-eM;W`4eP6kiSFGp 
zG|-rm?ay+w?q;Yqw0Qp0zHs2DV_5&d{MoS;d}N=>us^%)xeGL4WTxfVIsIrb)*`b78sEF3+u zC-+r%I7koa8vV-n-uG;ee4=KDkfp1e&U2qjs}X%3_WE->z!~RxCkF&t@U`10<61}C;>LQOxF0%f( zBg<|k-+8e;W;(hVQfp~!mHvB3>0_iR9XF*%-{=7FoFo(B=va9YzD}uX)WYB#pkZtu z!%;r>yEP}7ycrEyvFI9jLl0WyeX<79EZzV-a2T*4T+pJWrb6&Ywv|saY-c}wEXUJk z5vqR>&gH1CBd2_3;v>9Ri(BffxmtI&N+Le0c&*tAoRWZ@R;FKc_OB+taAim71#~~7918c@p87-y&Afin;SPQYRM~hgi*Y$?6?bR z#%OzBCv$yl1z%GBaD!kOMlt4>f|EufxKCRe2ZL-2D#%wLDX=6O3X|5UtLh8#(>7#* z{&b*G7NZeO8^jB80HO|{XM2+s$7m@OjzgKwa&sAwMv}aM%Q#qmBb0Q))8#Hl4VzYVeXtv-KM$n$9*wioi4UgaMltK!5mrE{&6YEY7|XFcjJXm zXh2=RfdWJZCVBx$FoU)J90y>00;vEI?L|~BAw?1_c7j}y+}0WGST8H>YNA}eFcJUW z7I~6CZBdd4JH$~#BUp2NlV|jpcHLnL={)NfPBQ(q2T{>Cb)Ca-BP!7|tM_T{i)GoL z1vQ^EO^F}HqEEKp=%QKbEL4aw+FZ;>XNB`5Q*ETDZTY*^JI6l2`m#CZ)}DP(eY>qb z%uwr*R(G|Q%>NiUuq$rx$dX9rjghKQ;sy+wZs+_VKM!k7Hg8R{fd%Kw#6l&o>|(`M zw(%_USasN1p1Hq=B?x_^*0q;-JVqvmn5mzrO9eSR3ro1{oS>mM>Pp2vn^PSL{n^ho zt{@C=xP$sf-9JKyXCEmz@jYPu5S5}bxT|x=gr7BOCh`3NulafN1q4n8TpBN`kc|!Ci zp@1_pmrwFHMN1nFnfXC2^7+&~gfIYvpA*a4J65!bPozdMGGc-B0LxK^r(a}c?)iLjZ?m53?S4(N@htQqCn(aG3*gZF^?W(f zHei#G4UU4!_hQL1Bt3q2^L8On6AMN?Jd$}}WcPd&;j;I-G-S&FBC`pmx(ie`QOG$i zaj{(YkCyj08VSHsYrXtk2&|m)qGuT=R&RsAKx3j?g0&+U1pvDYvOF~7M}1$BDCBE( zW&iGU!`-e{eD~-WDE*@vU_dm~9Z{(3`&PK*7lqsKb{;*4R_A8leW$e@<8E{0%*p55 z?r1p^Bk+M4fMivCcX%t{q@C*==|TfYImPu68KL!0*kI~jo6@r(Zr@c*z<)CrceG}^ zs*HJOx9E8C>EI<5gM+n$fsB2*e`0*ByZGzi{Xqx?6?#Lz>#Z)J_%M%nx)4Iry7Cf# z82I5(`%KG+MN_Hs9}salIi5`wv2}6zhpZz$_^cCycddl4Hhf#^(KUhT>AtSU3ZB`F zFv-c8=j7L9mmsy^j)24|{i}iH|M;halfi&&cT}_t?i3dep|YHVoW|vj;!&Fr*SVXp zzp>{hDG`F{h}-m0-dMS$0M5CH#NrpTTJMmONI#s>WFjgr$@2n0atRYS=pI@B8mxSM zBK)fj9r-W&4@*#&U&74x^6xvfq9%x#*#0oVygDu8Y$@gBC4qvD3OYLx(8K!G8ia~+ z{YDLMoW9$do&Z(;Wf^hZ=%ROi#gk)MxA%E+BEvv}*E0R#fimx%o`~TOe?wls{ap-_ z7ElC4On-;$d9@;&jXa@?)h{vIs!ofC0p6ec;5v$-8_wsQ!&FOGxG3t*a)`{#Lm z4gd%$H>Zvuy_>T*)&KeY>{ zkEuT}EA&NEP%d%YwZ|p2{W6f)Cet-O6ZnHMPh2hGYckvGSus<{LRde~cQS*B;PZ-P 
zm#(>Xa03~nXsY&F*BLSaa7>YhX`?At^68q24FR9wj(oM?7qM;V;OF~Z0kcusF2c!d#Q>(F4{u;BRkNNYx()=g+pog!l4Df8XK_5x^9UjuwlJr_(ZyAtaJ^e4Sj6vCta40xaeK}Ah z&a4=qr+|a;W$0!b-!RW5)KH1GO+C#>K~dBKszowgdMC*C=_?WC3zaC@x0WvjvC@kS z8UmdC&2pkdZ3a6Wss4#+dNkZiA5NEp$AFS2Fw2Gw#!q3Uuo)^Y07Fca7hEscZi{Q5 zG7kT~LS1O45DFxsq|<~U1|dwNy_kNgsRO{GJ=PYt@3#4ZDCFhIpKJxJ!DVV!SuTJiG%G-=M0YZ}o@;`rFhQ$>G1 zspDyd?Zs2y4}`z~1fp+uM{W<7{Gza;6-Df_0 z{Zf!<{uo01&-{o~zk(2sLLa(kQeYl+9 zfT!!XP1kqIAGuIWS~6~4O?UCqx+VMj6NE#7U-^4O7;HN@g43}SaOYW`)yRhh2!Lu!50#zn(SuLqoEc7bYyX4Mx0!SrB?RFOK&`{JoTq z1s5C~6IE^cAO~)Zx2Od(-(n^uAt`o|Acc7N6AvX3;TdrFH)7h$%!>u8F7<*>x3SDl zCxAd``p;OrZJEFZ|q??rKMjZA7)%{6PM>2YeRflpc@QjI8o_ zG@$o}Sk@XB4HB8|vA+@(wX<>yxb|7)Qw6>25<+;X;ZMj=IDr}wI3WWzN3)%hF(sV@vDL zBgJ!56dbP;$BB8!ojW&uZuZMBh*`zT^g^%{>c$Ti40!)2aOvd8Oy0*m6&TgJMle6& zQs1XGMpKKs@1~eQPrkFe(d7?G5AC~&<$>1f$g2dw&Oyv08tC%~l)5{aq(|W$RsyYk z;^~&zf$rAluNL!AQxw%UYUgv}OPgkVH`nMzKh^L|dLRdp<~chprjZfB?Uhs@D)t7A zO^_Rnx-c4`o)*uH4^`x28`kn(NS}*9S48p>V6#Nflk0#{=zDACh^FwrhYfTt|5GWgf12-Q&Y=#S*j$F#uY;I3H zg=$PV2_kxS)^7h8+iw945*gV~fv=@&rIk78K}q!Y0I>lHuUGHdcGbi_%6(N8P5i@^ z>73S|{&Zp>0;_*t`o>Mq>*SPOaA7*)vhK@poZG_=#}bqZj{qtj(zS1nq#KDmJq;+z z9CRXh3k!6Acfb|h&B}$-mCLFtT+7W-b)p~+*9`$EjVClpC_>*DC2CBs=R(VP=gaLJ zSlnsnNu?TrV+Z`{+OhISim6A5jL-GbSY8CKl`P>e?GL&h{59-_kAspzPIYgYpwagd zbJL;gVARj`!4;<*@}K4v)>OSh*!@p1F}7UM&D z=LBcYw{0)#lC2W~xOl?r-@cl0&dmE;Gg$UKfBBVTlSAdYP8(26dqn5Fd36kdDD=fyfa!inC>L6_`eU ze!lVkdmpq0$I1v%tY#3E+=G&#d1VLW5hJnmt05sFbx5LD6Gqn;)t}=MX&{OY4C#jN z=AJ4wxmK|$0tJ>s<7YW$5J>#DsEv{1@PF$W_gVZ3yUq(?pyF^rS=01}GqCte6~*V{vN3*dY7?TrpeL@8Gj ztzf5I*3yYoiKtNiK0!2K%V=xk6CCv0dig^)j9P2vV92>vd?r8UN&>Q&TeDz-d9&&DiyC)SAw`c!(D|%X z+JVzS{JAN2w%2LT-C=J&(kiHqKZd_o@BGEn7&B(~Geyi}2~t;EOh&9rA)>>4fyrx- zk^4gh`R2169e)>2RRllp&uZ9R#TiZa&PEdh?rQ@}l@~moLZDx2wXV~tpaTz_aTjgX zJ>P!6^hIJ;Z{6J%Uj>xNnOb4h}Q6nLu}39z1EzRe+Y@N{q4Hxqo~Z61%G+$o)G+LJs8(YF%LK0O#bC<{3?s7tea~5 zCkFnm*v1FOTuxRlrw3Y$Ad5h{jY{EZFRX2+2HAF7?;_$j``yBSQlZ-0WNX*Hd9KFJ z%;bqYaOW#o<%NTg?E53Z#M@&fz2kC}I8|FKl;37%{Tb8wzZEBTIYGT`8hoz?X1J%} 
zvSihyFD`Rr-6FNuLK5_bX`hUfxk|UacTv%a=km&W=&etuW;)s;hQ7t#u6_IqbPm|4 zVZR`smo_zc{pGOBXJ$z!OOAQ9S9Q|fJVyFqtTb);9WO5g-XD z6ZbaEerw(4EmS-x1;dZk=&ALw+&B8Ny>+cX!4ICT#es~=ll@C&W_NhHRX}jIYz99d zCZo;4X=L9#)5ZDb*MBEvhRUu0D3konEm|E;hNNmNKeSur3od>+(egb9x;ns*9WJP1s~CMdD`{3p+LYHb%25e5EzW_#DU&PxF+B zd{KZC8O-hr&>GAvwfX!V1TG?k8Wu#&SP5CRa)XtU-b(zoQe6%!t8#r@l20VZlM(Hi zs7iYlxjg#V(#FshW4a8S#!hjS@D<1NF}i*kOAzeupM1t%xNtU#rhHl?Ck65`!NGCG`Ztpxa+uhf1A``gInjk)f!9Znyzo6cM3^PF( z4-JjAQoa<+FAA9}uP89W?~1@@L1b+f8ObAGf!k6mDvGJULjk^8R&RC4aCZ^3!&m3J zwQBeFPT;{;9{3-7kMCA@{y{!wnqnNIHJxiOF>zjK_HcOhgppkyqZ3=0^mM@|=V48mWOofFgIA|XE+n3&`b$uk^2@&<=*5bJ zA|Q8=Zn+4ZU!jYXm8w}MBtI_TglRFL$v;4Z#}q>G72EOT@){=tx|bIOes70dNW9ls zz&2H&aF;-z@@po0QLw=rV69X*+CoLnRG7|$A6Dt=3D>VZH7X!mHD2K!U6JTv?WNzaA z{|^Y&^PB!J0PlY@DA}4gxf(ba1H|N|3B}B89i1Ev9n9^V2s!DP7)KsI(Eb|;kPwj- Jt`XD^_&*m{`LzH5 literal 0 HcmV?d00001 diff --git a/images/scaling-mp.png b/images/scaling-mp.png new file mode 100644 index 0000000000000000000000000000000000000000..75f34985cc528bee546b823aad4e94f5664e0d27 GIT binary patch literal 22813 zcmV(~K+nI4P)004R>004l5008;`004mK004C`008P>0026e000+ooVrmw00002 zVoOIv0RM-N%)bBt010qNS#tmYE+YT{E+YYWr9XB6000McNliru}G z8CjBvMc*~DxKuO{MckqoG`122iby^m2-q`NvXM(RS!0A4fS5ng<{Z0{<2#8{k*+LNjwNwfvwn-NL z7sb4K+!6xFeYg_^g<3Wyp?T$YRX>}C1>+-C-h&sve%Qn3nivUCh`FgTm&cv@_U^wf z>c#kIS>+AN@0Qu^o)Q+;G^j=hG0$)H2B9{@o2$z?Rg>@e*8TpB{Orm86ZC1HAepbw z$TfF}_2tj_h>iU9fv<1XMSQkKom{}C9m7N{50A8rg7lStv5Hm{YVGKQ&8l9|bkq#> z>DnlG-pP0;S5+C|$v00pT}SlR3ZRr&)+qu;24;(@mw-b2nJ>(mvoPc9I% z^!?=eXU|4hNd7k~md!UYk|bZkWO5`vBpq^iJ!;al#813t9!W#e=GXX;ClHqx%*eyo zL5I8;n;786{v?lVc@lHz<7kk{c$e$3lB{?ceVkNH(~vX_s9q%L1j}7J++EkbK+@qv zWEQe6n4kV>r!F}pX^i{lm0yxaD)`T;LnT!#8nP?akz_k0ke}i;(lLClA`Q6_k;mgN z%VFUc_Cfjm)HK|_sSA>5^QundZ&z(SxCt#_)C!w8eNl ze0|re{cejc(5z-nRJ$?59R(SD-Mpny0Sv^;(D=S5Y}Rev3V?;t9iSttkH<%74)k0W zAAy@s{tkC6l&9^(pMI9Dv`GH)u}J}`)(lQvu)=}b`O1r{9OS^W=94YKVHoB1SKeNb zsJBILLiYL<>vwNH1eNs3)ANNOK!x3pjzD5XZ+L)(K>*pX{pgP|NFRy2;V9y+&JeWQ 
zec-!p@QCO@!KlWl&2{h1oB|EO3)O=!nCxy_e1n zzK+qGe(#!GW2Q>3`tIcS*^xj zwFb*O9f-ZJ-6pJQW%|E%2LK3-_XDq8_25Kk5hh|lLRz>K3vuFj?V7M4Ow(C}@R%nD ztc&g8_X-%TH35T9Zg>@6_&tQM5deT(_`wV0X83KGsFvGv@(0&HCH8u%TLAA~T)y`; z*CS;*4*aqM{);{tGN;y2MC|(M!f6fi1ObY`Z58gy8IFY826$6D&Z#!G_0J-??aHCE zJ7v4m6e7h+l0 z`z!h|knCK6E|g*5DlC3GCFL~HIvjod=8i{V`#f3`Xam1#}9Fd&9-s;f3Yic>pYH}tXPH6)MCvoxAo}pB0z;}&=A5`*wX!j3_woGn5J1c=B00x&=nYyg3%LJdY9h8$TE z^*bLQU13Me zm{6$4u>mkb`VkNSA_E0)_k-;sKgZ4QeelF#Y{PM616UIwdHa^C$QlrX9Han%G<6@p z#T$Dv0vSNfv1n87M;urb`^VH-Z7ZH`qsGM+F3I?%B!vH~6^kH@ja}vitNAqtW~xBT zAMaXp{1uu29&9)COWbrc0OCzXVK6iP5!}DhS`{{8KtmG?@fsQqNSZ7H4ZnM2(Vo|Y zqcG_8J6_7X8ASCa4VzW~FzuNNm2R&-Yp>KW05nh?9*B5zIcfNSNH)42gvtz%FGdazHD_*JRnST<2z3NRpY}571gbM>-7Wk-BND% z-W7hpL3+t}{@YTdyMeD`1j#Ms>SvQXqRSs3`8hgMlOGOZ9!H^jLC^SP)R60YR8OZN z>F|<%v`ISL2VxBpH)e=GCwJ9!^gb4*XRuR`+8<4iKAPw0Meg@dPKKUi~@nan2A&jR@ zw<{b?{*GnG@EG|r{^C-MV0XTYh2#(OSoZ;P7Jg-bG~9D3$!hC4GMfAfW4WpUpT-ID z=kE)!c)F5?)%%jyA+w~e@PD&n*)O-l#Sh*yxba(l8qKUB%?7d2k@sD;`I=!2@TWS9pYM=O z%`{94jgyLJEg^viZ_SM$LJ509q4tNwfM49t0} zV#B|;cR{ag8X6`vzh}zOk2&+gnAhebcJO1(dNX%u=hq*2EX#EnW-xw&S|iS}^nP6@HYKBcM*XAF+FOMJK%V!2Q@32;Xk12BW1Dou)}lasW?b!H zA0SeaMMV7~I)@V13Vny;JQ)r|Q&0d#BIl@OH&fZ=oP#1Rt)b^+b_FM;Vhsuahm;*v z6N$MM?ezMQ@mX=+)Tp{fVxNwyK>-}`c^U>69+Bp zsij-0aF>g;D*WEUFFQR~jj@3Il@qOS>K}!qF5?7%uQyR{HSEvll+3T>m)BCyiH6w4b3e&QnNjdU?9Enom4@?kLe z?5QgS`0`(ykJHT_2u&f4YoVqPbRYzTCL~P|L+~ph=;}QKSiWrx+E5(ALSp^br-iW| zb?!%`KE&j0Bv|9c`DmMs|x2nYkp~aHZ8%jw4!X#HMG|&?))Z!PP zx`eTzppl`B97}($pa&?zo`QS9P9vBJ27oNMd1f+kva%|C`+%SUrDSnX z{|5%|dG}2GZ<}y_rPvWZF!<97|848eE676l&YYB#mFSmuFrsn=Bja;X)WkZlGNoTB z3Z`WDszw38oNkk|&*ROg8&`7TK$z5lGD_f(BO^Uolr_trbuaB$oZptlH1_O{v@$E! 
z20(#-vP|XfKww{#%2EDi5ocsKPy39ykJ0Hhs+4SlqyA6&&a9(Clry;|qITq2_Y%_L zawr_DdDfvgC%1M*07hc6H@o$PMrIt1v;ES!P@M4*m2F^1Ym!qy7?Cw7p-b02LmlL154%4%Ve`O0gaOG@ zIdK|PJ7;X0{W)iR5mnE_jFa0X(#O(g-r9K7KOEnf z3AJ%ZhqN7Iv=gTJA&J93*OO1wAhXBNp<|5g7{^ePwlU4M>Qg1=kTx}0CxPUlSIIt~l7@!9@>5Cj>PM*A<2#ZMA;)D%zl~hoNBjeilc9=r_+%3SEJ}qi!b*eE%92tUA(Ew} zz#IXASNmy5ku0SamJ&kAe*>yO9i;&cQ>!YROl5*4rZOB00@+B#$&~sR5!we^K;xCh zH`uW8^r!I>CKuTlj;*k={ipVjly!_H;4eR=`)w@@@Z;2v>%6~WMC(_Mytd^~g+7Z? z>s|-TUbp|~6`i#D!4mdepD%uR|L%7H7@7A)i=(x!OPqe_$98RsaY73~&3rd-?6T_d zzqH!Xv(3k=AKrOrWa9FOz1KI*`|7(^<@)V;=;_|@ect?7^&5^qb>VI6_jKxQ&cCxq zp{sN(L0J0cB^#S=eQy28e)T3j-FW@sK>CnqfEB>ib$fUr010R{J35jQtkCp_+rS7! zV~V^NOAl4~@$R0*c&R1eS^oNM`l1GD-#0zjvFqa;HnrC9l*Qh?-7j)}8Cz}h^_fHO z7>s?>T5Z4LqP4Hy(|G6hA=^~=v{1_2B0FzIP92IcU@1)a_GJo(Gm+WF>{vGLUWJ%vLWarPdXu8VoH z#snU3_A;|C7#%p?GKsUjn|Y+mY>v5->@|mRgO2sepfNN<*N_8eGwJq%HvNpQ(B`K- z=r|2lGr)_#=f+E!8a@2bRqS>fCtW~ptCOuYA9#U*yH=5R%;3Vf*<4)1DQTho93;R; zYf$rG{RF=CFb7mf9XgAc#+#mG9ku8jF_ilTrZD-Mc#f%c zkZ)e@a_-K*XkOT#dHkg#+3#o8>zUFsb2T#9a5|Y+tN(NDR?^t)^#N@fCSZU&$Z2*5 zdDUZ^mV7t#7^gf%zOp^PH#~gm`0HLf z1|tSIsP-3X-&|`>c?g4aEb%bT}k;eu9RGy>@I`?44Ugn>eu zjkyjK)J7+}Mzz!CF=r~%fI0JJ#sDTnekTFoz{$BYtYye6hk%Z3^kK4>kZ>-jG4x8P ze2U&o^vD26fgN`McC{Mq+Q8__>$2}?o>3e-++C0}z(KQU$<#l3Bt&@Npv(LjF~))s z1`6pK^SrPimm;YJ;4i*f8H4<(c%h2~bg1jWgyl=vJMY))zK1D?fT?Xhh7~&dS6TuR1ysf_8Z<>{IUc8*gcm z3o8;9FS-7jmfq|X!KkD~h=@g74Zw7=ga?R-D9*1%p>ne2neBy?Frg0kqr4Efs(P>f z*T%x*g;6W+9Mi0ZH!}y33c3*m5bcFf*n{D@;}EbhC1#jGw1t2H)T-6fkG=Q?iYeOY z2mtz)IyG_WFJG2bvUwT0R zQ4#@4V@;~EI=%V0e7f?fJlE3q)>jLDO87z5i54zaN35&4-K0Ac0N0ae{61xNQK7FVyFSAP5T+qV~<#?I!9 zZu{k~KSEh_&}Q3_c#Wt0zawEbGB!iuU`&+G$yl75w`WbR>)YLQ5WeI0#US?Qw^l`X zw-5g&!p4>zMoJKwgR^mYi{iSkjeAcTG+khytoqdbK6_?j;P56$BsRc4&zc>v(aVk+ z9Nmb|)Q+en2cAUyF5tkvVwSOujDK?e=fAuVUnpORypOLla^d0-Y8LI({W}5%AN;-z z+g2fYEhP5VMv6GNk{b^#@fixhzx@z$_CDAi-5(#A6k$!8Gyx)K*JdaIgAK!gUN!Qe zs@L-kQ}LHv^-#+;vxnT-DBl@wNCB9$ry&&S->0Xxj&!`2REfdGJNKi*z6)1m=G0vs 
zVKtr?$ZFx~%ca}?b3vea%>^es-T#S8sa(ehfa2z*N7oL!s)(QrIR&kMzvcO-@IwkHCB^V6|UU4)nB{jjmwX&8QSIK zbZ>Ueg>`h>B8yJT&$FLwuWq}oDc#PQgn(%}t#$ozkG(2fVvT@1AreKv+&TrF4*BZG z^q!U;-+%A%$jo+Z!>#-KoorVA``TuIy*MLsf5~W^a(MosTN(t~_5C4b!#|_0LhjFb zwGTFmp%AE>u{QTk%hfF=_l+u4$N2Zo3-(v5dhxcjUeP_`ckXZ9a7LX&En{)nvc$?= z_gx>+cC~fwo)tRgf*!LHcD-C<*RlvktgjQ%xX5XtkY)Mgfi@Yai8SM%C}rYe6@VA$zt+BVr_W{@-<+!X9? zFfMi~QLxFj2yyav4dwr@V6SZ1Y^R!#El=G}WjpJxaqYb5SjG>_8 z5JGDjt)a9wptdRVH8}>_F1o1~Mpb?ZvFO_b-uSggg^`&H_bUKCfs~3c`%a2OI~}Q| zv;A1+Z}CO+jWmv}2}Oqnbl;#p@Vjit{`f#|}aj zQX4~I91RGgX|!fORAaeNsU4yqC(ne6rpG3Fy%-P!xo2IfF!JcaWJMsLOkR29?#bew zeL`rNu`TuJZ+S&Q)sABb%V-+7c}~X|qe*SMXkcuI#<3}2Y1WE$sN7@#AdP0BX;MMM zg(av20SiNW^NC~8x35tLzpQd{u#SQUI2eBA8k!A(aA@k`T%x3N%>~%2QmRD!etW)*VxN zcgDDh$L7l~eropiC&B=8Svptudm@ph_;q$pS|7h5?%MynssbM6S6#^KUSiu9`Bx)q4ov5vK_u&~;L(a1>^YjVFXz8!MS+}Z zv93iF0Lp_WY9{Yoc(f|++g<0jsK~FQR+(FXj#pwOGpuPU~ie0cHdxj#2A z7kGLOKKg9h#PY$mGJ-(8K46p=G7rZyGRo+AG}`=o?1Q4A7W1}#|D=61_T>2MVgK<4 zHeL14{AvIopuJ#D=MKm&FClVpG&kI^5@mrZ3x2y}Jp!(Mk9YSY`^en+cC z*Nb@VKe|+^wxLv#qF^+ZA1SVESzb~FHBz7LXm^|DKQ25MeiH1~CpW;tC`P&Xc)=-j1+DQM|sC+158-v06Mab3v z=`lGH&=Crw$6wMT4=~g)8bT`pXoLHfE{p%OY-A}7sU!7OxXHB+@gGmXhpPKt=}AGKHOHQXPaO zIwL6$DJcO^6c3#;>GiX(S|j%r<*f03t)sn2x#5}(h!CrA-xIBc%$<+OILPRVKva3a z6N}|fTor;W<RymXK+Iv}DIgPBTCW{Aev# zes(4*BK@U6Q51*lFA%LD`(OMIBDwkGE8mq1ls~prM#Lem98f;egE*@^_J39B^?GH5 ziq(rZ3g?JO%R*{uGA$xK3#r+{pJr0enu`_qv83*iJubTZ!C!8zU@yEdK5cvBor~HW zOu6dTvB|Nk>sBtMpD=RUJyo0d){NfNd&8thY9=21D5EVlO&VsEua))d1Lc}W6u%^& z@5j<|Wnahh@iUE&J}|fcJ0IUxYy%=D_OEp2O^Q~Z^gKI(Fs~&rYt8aozVxy|J z{aD={$5NW&Enc{8wD;l-VqzBQHR}BXb!4w-xMxhgs#8DjRPWDkn%DnnV(SZb z-LUk%WfxW}3)EE))NhcrZe)D-wLd&vFMi(#&V^Y2{rxeqHPXlbr%F@LnI(K`xaDxy zE8qR*r_F=Q*K@NLr2@3^j;A& zjLh}^FJJwRG+ei82g%hd<8xM-O>0)|U%z^-y?*VAliSv<&iHuB>NS6LK2+?G)(PkW zGaO?Ajja~rudR{Y@;1bQag0016td`hy~j!tR|U(cgbDVNG4(o zxveVcF!k%R6&0q*&@tS#_{`ETs92hRWg7PAPMtfMXZR#_!8;63 z-gp`$<7R`y)sLL*vMp?6zHj+#`FYk`SZ4+UbXY-@M+T7{9CLJ1cWVB82+?(U@EuW9!58y81xYTmJ!F|Co) 
z*WZ$Mv`R0Cys6%-0lA;FYcsQFa|)xDX7q+vC|nl>B}@!rFuhwPNCQD30>wfV=oaa2 zoRU|jkGlHmS&JS%zRwI^daUYITAiF5ZQmvU7`3d4W&9siyLowH=Y|Wjy4S(c710|z z^)VgB`XkQLNeT_2Alw3T_h|T;3X2~BM1BbuQJjVUVJX+^DJ7&3LP`M$St{YdU>+=h z677cqLT}o)=Yx*p3O$hf_Vw3C>?wJwBv`(jI|LL}Mf68#oOfsLt?>gAZ=L=YaO<9a zu@8a(u_po=9lm+zqFy84X?*$6c)V5X<`!E(9A8|->|8=w1(*OSg^-X^3Mun{E#CNS zs|Lf~C~gofaM8yvHoxhQM}TXS`r-acc{eBQ`Wza>3;=+I8+t-Q>^QtDjfd~ZB{;9tKXjwN8R%BKPvC4YW=p3aQB()s0tI+^6BYUw;VS4in!Ymc&qRA z(Ys;P)~#nl6;ik^+f&6>09ClZO5)+Wn>Kyrp5kbD6wRNQ)TP%G->TG}O>SBKQM+X~ zm;ZG>3=C#N0#xYMpOAsAdv=ew?v0XsOXr2rD-aEUB57CBkzBx>IDVk}Z$bn4+fVZ5 zyr1@b?^3pxi-K?z#KRr!$X6=05$z6LjXP4XVDWRfVa{%8V6gExM4;8}l`GXW)tl74 zg=!#STuuOZtlPp$a&7xf*K!r(B)L8orL68;@XAL@!+bshd?R@&I$w z#SNn_Id*x4zSSLj-g_xeUk9S&@iMw`per<}A60@It9ygn<7&M26h_wUg1)10McXC| zZh`_34FKZ!?xe#$KqnqQc;k35xS6#mTh7Y?0utTffXLW*?@be6Jbk=?5fW4u0OdjD z`PYuAGasd}wF}~-2xR&s4tD{v;{bTDEOV0A`LRAE0FMGBoXY_d19%i>Uwe`IrBOiD zNGq%T3d}!TrD%A05>7~R4ABS8-+OOA>m zPU$|Wfut(|UqwJ!Sd#pAj`_{K86g0H>aCF*q2NRYVixxKq~<>$YL=NrS_)L;I0mSj zb8KDm^i1;$((AcTOI!^DB4fpU_l<*&>IIh$Vk`wf6`}CFR4idCT{q%dX*X%|pSltr zmE(1!1QB_3&qe;o++^&@-I+4-nF*zSWz1g-QyvAc19vsbLYLUV$ah1#D6D{MyYyhr zjI1dycegOP#W^4-<$eSN&^)H{017|`cLz#ztN-pe$F#ceiB@ntRvv66rAG9&+-P59 zN^Xf7jj{-%0FMXo<^|D6bjiEH9}x@@m0@=wxUOIYS@)a;E0ffXA zl^}prACQ~_r1)_6y4X8AWJy%${;$Pl5%ZxwZrQTO1-}UVrM)Mi+^`c^vt(w32iuSa zkMCp#lmsNMtT!5wzF)hPWDY$@N`i+(;5d+(1*E3~>A85ozc#}B=>QPuZ2*BJ!Z{fL zPQrJ+UHR1tkZ98G--4<#mjG|vg$GaUj0{YIbE4oyi;}A7XR5kqEvw`c$H%PV(U^u0P*p&kVw#BIBO%lxkArLd3d&D(~=71~Y^OwZ9Cbxc>wY_;03J6sG?%HU~S(?&(!+PC* z@rv|@KC^$#HBF58eMr>5v(bQZLSrmT{5VqHG}gbVlXqHjVn+`JAav}l-~4uTn`%|R z*~W2!_{gSiUfeNldHhfJ*4Jm|_9>zt#j;7qV-itc?*DyD{k+L18l=n(wA(*3wPwpM zAB&AwRGZTnMLrgEv;a?fNS7^Uux71(V;6{1uum~cgr+QtHb|aEJ1GQ} z1yN6e!9xHDyYt3BZ*kO&FUXKD!?iN=>;Ouq0^a^Fi?!#!`+lnIwA~`Sa0Dizh}~XRpo|}J zaCc5gXWSZt^AM$qu+p|w3M*73;Y~dAc}f`wJCJDzAYJ9gi1QPWOVmAExhU|Z{$IrW zGHI1d#wyPG$&a*M#ABtZDZKqk0MOCeuK?|3=(Ng0F|kDv7mJsaiY3;57ApNyc*BKS zSWECn$JhY{Xs-#AFH;cVLxhV~S) 
z1c(S9ErP}hwF?g#Vj^s<03AWU>k}4Qdh?_-QF~%)`)Q)nWK`%QG-CiLk#8IzjPW`s zEyR#D_`_<&N0|tM(jMuTlGJdwYzxZ{r!CIc;)UOW(8}^j$;h3RtNCD}lOrO+8it;V z(kdvZz1pZ?IB<{AC0k`h7J2!o6_p`OY{_j&DF{RSHlnQ1#vQJ{Zva-&Os#)?*@f2wxw>Eo(U6c=HRaDk>(1jU53s#SUd zE5G~toFMv7RhF~q6=y;{^CL^y_ss`!{={!qe40~kY`rmM8{3PtrlQlp^Yi_ zOBXt2{ftiuk#h9YH)?B@F>8DB;7jRC|0(JbCzRGyq$PiNPY12+FB%=JeY0czWI0(6 zf`CyQmp0olsDAE;b0Y7{vn#*)E<#cWKwG1KZ0NK2PTS*a=Y8UyQJ{{zf6cDDE0xmG z20i5n^km`mQko>i$$u8s_~7({OR4^@XdLjCH<sk5)`Ivn>5x-2-`E}iVg@o-!=Z4mRf=`>)S|w?ORLM zb*BGe-CJ=1I-GkU$&>gtv`xTgxkI*3@~jWhtyJRnFaPQXno=8X{4^qo^!=8@Y1qbN zi(5RM@1iT+kahI&&G+eCwtzE-ki$mNCaHTTa7H&yZ%6)_$f<9J?sByGLi4VdUC@!f zRd-()Jmkr=oXCG}DP>nx$%l=nEbDwH{oLP|eCICiy|8P?tD`4z#bp6I^3PC9!SL9^ zhOg7dr31)^UuH_L9FFh9Z>}JJP2&%rgy&av);wF2Bpq&-JIU*w;nAMyG}QcPDf!Tn zunrs7OlaVv&8e;F^RZozvblH-*?jf+s8|jcx1pw{zy5J@1GbT!hB3ego(#<}L$+8& zX0=Kr?;gl2{w9BJPD7jH2GFr9?l^oU`Daev;Jh%l+U1TT`S(5sIH!HkefdW@l1*2X z47ucEH1pQWFJzd4& zd1IxJC#=Fjj)1^{SfEQzPP`X^Dh;N0j7>Zkf!F>MYK*D^Npm@M^SVcAWLB?Cc>aQb zMxFXAPHw!XL~6|0_uP?&l{8K789T9+Q#M}B`8!KQp0dh#9tP&@{0A!ZS|A_@Se6E2 zJ@7g}jLUM0APmYWy!cD20W+en+?mo$%;j4Ahh(wStqc=6lCE%l_ouhqIo2xE!N&iN2n1liyt-C61JJ?} z^bDM?ve|35X8LbYv%|KK%2|6Uta7L`{1+_WV*xOj`k#gR+RPGNK1I?U02qY| zrs6|Sm@-GYKEG>41Oa;S%zGD`@SJM|;AjiT(6B8a(?@6nzl1~<5wP*fJ^0|d*4Cbo zn?Z@M+_0kV!k;n30`E^9n$y;*6;`rAH?ik8(4Q3Uv>YcYPUL|qmm|F-#w%1?Im{nXSZfi5s?6*JTMZ; z+9~VdBHqX{COktTy}_SIP~yY;F38;5L0b_KAy?dpNT6!v%os7V6Y6FFi>uZ&7SJpn z%D3Yaz%wo8DwMwvvA&R9Zvs1OK>&!106__w<98#FJ)kUy#^nnU8403U2Gqv9&J?=< zgE}%25V7HCiHLwf<8)wo`RKZd@Enit4p`%xwzI1>MoeDls@%*MxF$OZ7W0sBE3kJA zEMR+enOT0ufD%8Wvs_7UUb*FC+3Z)NcrsGMt&=DcI$nYhgz{H(q}we0m4k)PXxRXT5*sq6Yga*Ra*`?9u}bN3|0M=&VJY!U=wh-j+;>pmNV!Tz8kyv;9!2uiEMadz_!4cP;;F=$5&L z2tg7KVxT6BtHw3T?}94$RZ#p1MI2_pp5xs7c^ zF@F~+MftG+7+20Ad(IpEoJIbG=Ors5%pL(~DGdQF!Y61GXR168fKNX<^z2uZ1#0jQ zBZVfyA~zJFjc48{Ud|5BB7`=Ou7rHB&e;Y@o)@1zo!~i1g4#XHz}?Oubm*oT2$+Jz z000d^8nt`96o5)j3bSSvs45t;oCmY56pi-^!4hkXXeCvPbM7D4LpfFKtDGvH10g4&Q3^s>c5vO2ekp_iK3u!~i<}Hhdp0@@ 
ztitj+07UK~C_*cJAb3cUz|quF+P*VvHgfCQ_-(JW-ZS>^>6vk((q@!lZVwn4zc^vt z?Kg+ip_e!-)CcF!`vY3d-+bAhH(ZKCFWj@X$0c-@Chk7*Wr)UT_4n@DhpxC3+uyox z$<3|ZIpEyH0Q*N}*!V)Zhj^_Hltxut6l6qkhabLG<8D_ z9{!x1io<2+V;QFH=rx2suDgN!`dV)Mp8OOUoO~(iogA8h4nJ#2J~N8;?yoBO-0h~D z`aE>k6HN}yY)ei}<&R&7Q=ZPQQFUmTS#=eE#eOd6KvwvQHZ@1=C7+v8qE)9?ESt}l zC-dZF{@I!Vve~s|saOu@;t<*5iI5VPLiAhIngM>&kA7BPvfZ0**mW#FdY-KOR%oVa zF6cw@A^)86wr1;J>Cmw0GIFPneG)kH0Pjc$ZwqyRkKauuB+}-N*E_izl&2t(Z17Ua zCY46?d*x2%@wN}h9p%jD@R6}!u;om0cW$Wp-th%;PE$HOUO(5}vi-+Va?f<`y^fq@ z4h$_e5^T{~x1}i!bMKtX1vthoa$NlqAFV#PYWHB)$Is=X5M0n(A&A>cS3YI4QO<2`Spf_LrN@w(<_!>rS1-%xTO`m$x|kO`p=7% z%M?`~)|HWXyV~97>@&5&A5cncs- zGVO+z*QDX$=)dn=A3t#0Z^uRG<8DIdWVE^K`9D2a(A^iDczdt3@Oq zWvi7%WjaAbWJW&uW%iUz15fP%pWXIPXkjO)a=zO6^O}BWcmKHTNj?n{QKlU(N*e** zJs4x&Kk!bE<*DJcS;|)}5?4QfrL|yL!Tpmy&xU2&a2@~)0+2+ucV@~mM4>fWn3qCx zl1Ys5uRpj5g^8@RFs&mztoV5EgDW1LH0P3;;kK%jvcppyTO-D_MD2=C!qDKe!N=OP z-)n^vgVc#yw(&P=7()2)#%O<;h<-Zq_(y+5g|*SLSX>0)_dp^c8XpXvc6o;jv1T7C z^y(K{^PurshUMS%+knYD@hexa z&04fy0ML;~znE&BW2g=nY6Ps@bT7(nnLdrM;REF1(9%oYY{9P70aY6U=2XCuAEM$k z1Z}+a947UzT7GvJ9#;cE1Qz;n@3yz&;0M%E2%!$G?p15^?ZZk`bVgQaYwxO$uqqz0 zk&uyC7+ofwpHaSVzNmI^)A{g5$3{V+Lpm(8`vxp+HL%j&d(n9&P%bXC3?lY4LsqpB z=~#HlZ80YXK?+d)i&~;$ zJw)0eS1h7@Bn&2AsS)Xos8l`@KvjS^9F1IjS}hEspHYX}t-brsd%ak&J2f@so!`F| zW#Fz+qQlGq?HCg0NAB*OdLHP>frZ<5JGfD7QM02LH>|{{*K6R{@@cj2{}lTpUqJ*ueFcbKfvm_dItb$ZISv-SheoYxW-O^YLuPcyP=!5z$EE--7$z{b`XhM#OJtpR8|&2m%M!Lm^7HpIaW3u->SSrV;$Se}az6to0_)o1+{CXujNM~&^v3;5$?Qida z2PrCBppEPld%|CDgB04p+Tic`8D$R{gt&^ywZv6(E*du&#|6r@TgY6{ZBQsJw4hDw3V@~575&-rkza^f2pY2<;EVQv5`oPWrThL$eb zD<`Uy-EwCvWe9gA)G`J_kRgPyC`tr109xl^QfLgN6w-(aMo4Hcw1ObRs8>*0#u`ZI zNN5FVj3sDdNd-0pAWfyq!(6zMS}CpE6RxEaj#TFZuqK2Q%E(wPrMv#C1Wh@Fye&zP z<&98)F%(Kj4FwoT09FGdAx)!3;XrlOC?jLFkbsVcR-m>NO-yM7oHN(Rf;_QvsB0HDif3~Mv=smxcWPzx(0tNwjX`~S4Ku|Ch=zU}{EanC&XKSNsFX+5s| zw=<)#MgM{hBZLNE#Cg&FKMWku1yKKwY$&Q?5fH9~c;xQCm?6-?6OV*rf@e7c&+ zxg~o@hY&N;$SFrWyepbQtG9;Lq5#FSdDl&D`FPGpbG`Rw`>VY2ZWz;PrtO+E+H2(T 
z**nZ54UBZGIcc|R1tTBYRmD~@AC~AlIg>|Yb6QL`Ir{>V$M9pBBKnbwf1ftl`g-$d@ODrrt2^?}VY9#W{ zEdCK9W0IQd-y?s0iH^;g(OK@1yS-YreADLCj^yu&{A)y5M88!})uds{REHy?a;VAb zU%RI;-`K2em&&(?717TyeN2N)a%yY(^4Pw6Sr2V-Cf8kd9x9f@#ck;4N98o>oyf~a zF%whRZwtBZg;0rp^M;YUFMS!}$wn{Grs1C?N&B+nT(F(@Z4U42(OhmrGOLCZd z@eeoiN|X>H`q^~2s0|(Z>+WUN1a80%viZuAQAwD*p?P? zv9_*W(*Hn@ADXv;3a`hv4yLAE9E0v1{I7h`AFd02M^d>C=Q5WUj?4-oCXAYv8}(bm z!D#t@EZ`6EI17R9-EyD)d=QLWa?MA^j==C;0 zCkmcr4Vw4>2vontv3`t-4JYuL7E78{@{_QVrcZtE*-00bi3wuVK2Y*98tbI-JXS}O(^@*o{u7P9DB~FGE3(;|^U@VgYeh|y%Ps9w z=`c@Ey29q_?^o5RbR=Y^`p27}zz8}%X13c*hcvWFHdsQ^;k~`@=2LE0IIT7P{4m0h-dW70?zl9>LNGk>(atL|gC~FB@jAA< z&BIR~s*yvIIRm-!$@Mcr?+i7UHzsNFc)0CR^80jl4@S`r@a=J>bkApg3S(*>+G4Ld z0%NJls@gq|vJ6B&jd9k&9sX^{&f)t#T})K^N0TcWbwbDi>xN^@)twW2^a<_T)0mLi zX3T=urYSF87}v%qIwJcozeel?*a7uz^Jb$OcB;QRjK+qTc>rkgC&txmG5xav`<0_b z>fc(!nsjb_EEd`c?Q;U~b0a)>=B>*@^sN|t(746?(Kjc5Uj5K!d(|)@}GsfV)Y)Pf)@cutJ&&&Y=u5>#1{$Y()N?`{{GC_Bidl=P!zdt&oS{K~W;O*A0 z#9^7Z?Oc{#B@QM*0XT5PAHmx`S#tT-dV_G+vC1nJ-gr?+P>rhm=bceW^)8MA4xeaU z3nMm7zPcXpQ_G9AH|w|WfQ%zmeIW-f1aR<3T19~#b!X(xeBsi1fG@fgwhf%p7)y_p zR<>tG^mF9CiRE{;ShBEo=E2p|J|A4Bh<-xLl<%#Bb!vyQG*wDOzopgpR;#*kRnCUp zQT=OO_G8i~qq^hrNr%^F-ED=GLz10N&Of~HnpQ{WI_vi44z70Kc)vLOoILmSMz%Gs z%Jl(Nb7HskU7|u?h^A=uP3qKML(zNXhTZE1Rmc1eZE9psKECYj8l{wf$r1g2jE_lZ z9Al;|tG>5J6?7VBO?#()(sZufn|?>Fke~|DYw?zyn}<{isQo|3?PyV9RpstYwlCdt z_`O)lX9qR%TbSQ=Pso7_fthn_?yFp5=Mvxg-565d2d zfQ1|~|I%T&g{8#7`ze!1(qZ<_Y&wQKbhK2jQK8gXyZr^@U-u-5%-y6mToNEPcl??d z7En#@&Y&@5(%#@1x0p=DP?ed)`F34th!*lrw~QEaiKH; zC_!!I2lsOi0}L~oAKV~Qjs^GVJ((ZTkEM;VON;1ds6;;Eg7|~3(V=F#QE(?@)KbDw zbmzieJ6~MCLWH`4NOBSkXEdYS*nRoO zNLYX(gaU*SBJ5~>K{G5sC^uqTFrwc%e1)!Irv)!mpip7)842}9LG$OcI;vxdsX}Fsg!^ zQA#UeC@ssfLXDe3VOf^tQ9O0TF{z}Ij!1p;_r+2vytqG1qMt`f3ouejl^@Yh3M-7r zB1MXnbPK@cSGrrblvE+d&j>=5=2X23f|r}#ow!3uEAy$a$cRD_pio)~!dOBDAsSh# zRcEwVMjl?^FMmV1rPB`IQQ9>o#nZ*!MPLtXyT_m3;>SxXZwc4pECRE zTKI8G?SSPUR6mpn4#kO)4YY6?4WPCb#$0S9C@_DI|E30buJ-|h{A=x0v;abE;Q&XlOU8_Q9C 
zU(SDHth=#k<(AwY?Ml{khWjD#%2xl($I0ItlU#;{Wm<&R95Xv_*O1bPZ?2R~^qYT2 zQ#Z8P{z`|*5o5WcM>2;YCK`3JLsGA~1CgHc_>wmLwdT1?#!vpk(G}COnx5pWWx4xq zb;9!PI9!BU^}o|RvNG#f-KXe~v==AX4>q(*)>AUkFKK$4JL%)*M^kbx98T?UU!}w3 zbJO`nxe(XeZT^6A5m8vr0M}JXqoL+*5$P$9pI3Dvf7~0G@nAR(J7?kjHA$QI&ETXP z$mee7(t&Q4j`)Rtc(;T>^wXU43>^+T$Z2ip&ta!B5&aBL+(z=vX(83hQrHduv~lhK zqEb8)I_8p{zT*4zb8?8}35S0!B~!+chSMr8p+iGf9nBQ{!&Zr0wuLXx3lF5`vW+Bd z_Pd9iF4Acn{$1NoMtyS*5&dZM#Uae&pWc&vC7Oo(ywWkU-mA>RZ=rG;#~mX#{7Rep zy&9P`bokd|GW8QKy^Nfa$$P^JUZ}ag88ug2L9*uB9Bd%3{+SLn+ucXvqp*(dZSnDE zYE6pHNxC9qD1n3Sm;v;sLnJ{;KEBXfy(DZ z=OBE9!cA#a@#|Z#F|TVC8{GX0er(h~w2YMOSqr}3`a@t|k8&D-M4QgQ*uJ=|ZmvHg zc2u{T;gx5uhQTTeezcjMQMIW-#3TR`tH$E*YWJ3C)#+m_sq1|?Y~;0HYh|S@)w^Dv z($fy_95Pq5`nA)KSO4faA4fb107mWeT$m~06%c5+2~D(``ap=aRH|^x5 zX>J-u9bPei1@OohM|wCvjK{vl;qWR6-{MP;zB;bLz3@l8c6kG>z(#*)%!+^D+#>o3 zKtM)q__^QX0q;J~1(^~iGJe&cD?**hiKbfpbe#xzw^nSNEnM)xlG=^^5qH1UZornq z&u6C9563160Feenz1+B@T^$if1pr2zSo_(Mm~a@R5*2=L@Xag*MmDb5=dU;Bd&+ib ztbfwrnO~GWlg~18L3h9MxCed)Ecn#ylqzN%s4*F<+uivd0|GivK}g2NQg@J?_2IQ3%UoC7eM4C z*NqIhz8aN0>r(9vthzYQ^19$PKJ5jsx%%nrBi?`K*jw9|h0Pfw064zrOXgzVnqb(i zh_(YqA&jGYwV1y2))k}ru7Vw1t9kuytp=hDL_f-dpAGnS#>?a6`JufVd|ulhZxVw3 zD?cQD>Job6K(#ctwj=+PB26a+K!+05Y7c*Zv=y+b&EnM`@aFPs!piQ9@;18s3>A5N z;=E`IM*|)__j2F!;cgb9L9_OkWq)7Gf7B%8C{(F4{L@!M-8oE*ZvcZ&n*i?N|>O11syJA1dwi!%{|wkNV!4eB@j< z=@L@H#`C>_o3XuSVD37-diSx~>kxU_qEK}h%xMgP>REF)=#@L6EE+71f&&oij}C$A ziYjSgl!{OwDn&qGX;)Ny?Uu{42Eg*j^oDbQz9n0A<~LSh0isa;ayZ%WL>_u{NK6?b z`U%urit@-lFJG`f6P6ip)AfS_{aarYy($99gP`^$gwaDwY;d4V-~KmVi|&_u)(uAz zo8iQkCjg8X5(}IRtGNX5u;;Dnx*NxtP4%B2S8vg0{ zveXS_uI>K8qMvxzd6*T>m{{(Oy)uA=Q8k}CfUoB79yR158^`Smp{o!$SQ(HSX2H>Q+UYU;xohNYLykpaUIO;cSaidn zi8ac;6it!!{f@*hGBB~(jGs3zf*%R*)O`Lx=U=~BegCMNLS9==5y=(Z;^>>_X5;9aV@5{oeC59lZW6uaZ=HZzdF^o=9syu+8Z2xkIoWZI^hEecjrDS(i+=$FjL z(%Esyh_T?`n0sl2kqG!>^JE0}9;wwbWaB8}t408T&ncH{c?*FLOd4V&@Qb;+dgw)( z8R^ZI(MG0hXD) z0aFkn1gp;c(uAOqs<^u$i=;VkhxZoxl?)yk8Ak4^X!Q=(3 
zEaCorUMN`tCjY!w7qJP4Kqv|_Mg*begLmtJ4l~G{008Ay32EFAe?z7TRR(4MBD$imh54$pu>`>v23$tC@~kUi8S?+h|0m}`{(m|#MMlv( z0lfeK03~!qSaf7zbY(hYa%Ew3WdJfTGB7PLIV~|UR5CC+GBr9dH7hVMIxsNy0yuI2 z001R)MObuXVRU6WZEs|0W_bWIFfuSLFgYzTF;p@zIx;mnFf}VMFgh?WhhKos0000T zbVXQnQ*UN;cVTj607p