diff --git a/.gitignore b/.gitignore
index 34de4e77..11d3c094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,10 @@ __pycache__/
# C extensions
*.so
+save
+
+# zarr file output
+*.zarr
# Distribution / packaging
.Python
diff --git a/LICENSE b/LICENSE
index 99cf9988..cd61819d 100644
--- a/LICENSE
+++ b/LICENSE
@@ -202,6 +202,422 @@
--
+This repository contains code of the lag-GPT model from the pytorch-transformer-ts repository (https://github.com/kashif/pytorch-transformer-ts) and code from the GluonTS repository (https://github.com/awslabs/gluonts) from Amazon Web Services - Labs. As part of GPT-NeoX, this repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are the licenses
+used in those files, as indicated.
+
+
+
+------------- LICENSE FOR GluonTS code --------------
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+
+------------- LICENSE FOR lag-GPT code --------------
+MIT License
+
+Copyright (c) 2022 Kashif Rasul
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+
+------------- LICENSE FOR GPT-NeoX code --------------
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+--
+
This repository also contains code from Hugging Face Inc., Google Research,
and Facebook (from their Fairseq project). Files from these
organizations have notices at the top of each file. Below are licenses
diff --git a/README.md b/README.md
index c96692c6..d90041f4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,121 @@
-[](https://github.com/EleutherAI/gpt-neox/issues)
-[](https://wandb.ai/eleutherai/neox)
+# Times-NeoX
+
+This repository is a fork of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) that implements a model for univariate time series forecasting. The model is based on [lag-GPT](https://github.com/kashif/pytorch-transformer-ts/tree/main/lag-gpt) and [GluonTS](https://ts.gluon.ai/) and runs on the [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) engine. To create the Times-NeoX model, we replaced the embedding and softmax layers of the GPT model with a projection layer and a density model head, respectively.
+
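+Below is a minimal, illustrative PyTorch sketch of this idea; it is not the actual Times-NeoX code, and the module names and the choice of a Student-T output distribution (as used in lag-GPT-style models) are assumptions for illustration only.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class InputProjection(nn.Module):
+    # replaces the token embedding: maps per-step features (lagged values, covariates) to hidden_size
+    def __init__(self, feature_dim: int, hidden_size: int):
+        super().__init__()
+        self.proj = nn.Linear(feature_dim, hidden_size)
+
+    def forward(self, x):        # x: (batch, seq_len, feature_dim)
+        return self.proj(x)      # (batch, seq_len, hidden_size)
+
+
+class DensityHead(nn.Module):
+    # replaces the softmax head: emits Student-T parameters for every time step
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.args = nn.Linear(hidden_size, 3)   # raw (df, loc, scale)
+
+    def forward(self, h):        # h: (batch, seq_len, hidden_size)
+        df, loc, scale = self.args(h).unbind(-1)
+        return torch.distributions.StudentT(
+            2.0 + F.softplus(df),          # keep degrees of freedom > 2
+            loc,
+            F.softplus(scale) + 1e-6,      # keep scale strictly positive
+        )
+```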
+
+
+## Training and inference
+We adapted the GPT-NeoX training and inference scripts ***train.py*** and ***generate.py***; the new scripts carry a "-times" suffix: train-times.py and generate-times.py. The original GPT-NeoX scripts were renamed to trainGPT.py and generateGPT.py. Please refer to the GPT-NeoX documentation for how to launch the scripts.
+
+## Config files
+Please see an example config file in the configs/times_configs folder.
+
+## Options
+The model uses most of the GPT-NeoX options (please see the GPT-NeoX documentation below) with the addition of the "times_args" arguments:
+
+```json
+"times_args": {
+ "context_length": 1024,
+ "prediction_length": 17,
+ "scaling": "std",
+ "shuffle_buffer_length": 1000,
+ "padding_value": 0,
+ "data_seed": 10,
+
+ "inference": {
+ "num_test_batches": 2,
+ "file_name": "output.zarr",
+ "chunk_size": 128
+ },
+
+ "datasets":{
+
+ "train": [
+ "airpassengers", "australian_electricity_demand", "car_parts_without_missing",
+ ],
+ "validation": [
+ "cif_2016", "covid_deaths", "electricity", "electricity_weekly", "exchange_rate",
+ ],
+ "test":[
+ "airpassengers", "australian_electricity_demand",
+ "cif_2016", "covid_deaths", "electricity", "electricity_weekly", "exchange_rate",
+ ],
+
+ "augmentation": {
+ "enabled": false,
+ "prob": 0.5,
+ "transforms": {
+ "freq_mask": {
+ "weight": 0.0,
+ "options": {
+ "rate": 0.01
+ }
+ },
+ "freq_mix": {
+ "weight": 0.0,
+ "options": {
+ "rate": 0.01
+ }
+ },
+ "permutation": {
+ "weight": 0.0,
+ "options": {
+ "max_segments": 7,
+ "seg_mode": "random"
+ }
+ },
+ "rotation": {
+ "weight": 0.0
+ },
+ "magnitude_warp": {
+ "weight": 0.0,
+ "options": {
+ "sigma": 0.7,
+ "knot": 4
+ }
+ },
+ "time_warp": {
+ "weight": 0.0,
+ "options": {
+ "sigma": 0.7,
+ "knot": 4
+ }
+ },
+ "window_slice": {
+ "weight": 0.0,
+ "options": {
+ "reduce_ratio": 0.7,
+ }
+ },
+ "window_warp": {
+ "weight": 1.0,
+ "options": {
+ "window_ratio": 0.2,
+ "scales": [0.5, 2.0],
+ }
+ }
+ }
+ },
+ }
+}
+```
+
+### Model input
+The dataloaders sample windows of length **context_length** + **prediction_length** from the dataset time series and pseudo-shuffle the samples with a buffer of **shuffle_buffer_length**. **data_seed** sets the seed for the dataloaders. Unobserved values in the datasets are replaced with **padding_value**. During training of the autoregressive model, values in the context and prediction windows are treated the same; however, the input time series values are normalized with the **scaling** scaler fitted on the context window.
+During inference, the minimum input length is **context_length**.
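+
+As an illustration, a "std" scaler of the kind described above can be sketched as follows; this mirrors the behaviour described here and is not the exact scaler implementation used by the model:
+
+```python
+import torch
+
+def std_scale(past_target: torch.Tensor, future_target: torch.Tensor, eps: float = 1e-5):
+    # statistics are computed on the context (past) window only ...
+    loc = past_target.mean(dim=-1, keepdim=True)
+    scale = past_target.std(dim=-1, keepdim=True) + eps
+    # ... and applied to both the context and the prediction window
+    return (past_target - loc) / scale, (future_target - loc) / scale, loc, scale
+```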
+
+### Datasets
+The **datasets** option sets the lists of training, validation, and test datasets from the GluonTS library and the list of possible augmentations (see below).
+
+### Augmentation
+Time series augmentation is configured in the **augmentation** option. **prob** defines the probability of applying an augmentation to a batch. Each transform's selection probability is proportional to its **weight** option.
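+
+The selection logic is roughly the following (see megatron/laggpt/augmentation.py; the transform dictionary below is only an example):
+
+```python
+import random
+import numpy as np
+
+def pick_transform(transforms):
+    # "weight" values are normalized into selection probabilities across the listed transforms
+    names = list(transforms)
+    weights = np.array([transforms[n]["weight"] for n in names], dtype=float)
+    return np.random.choice(names, p=weights / weights.sum())
+
+# with probability "prob" a batch is augmented with exactly one transform, drawn by weight
+prob = 0.5
+if random.random() < prob:
+    chosen = pick_transform({"freq_mask": {"weight": 1.0}, "window_warp": {"weight": 3.0}})
+```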
+
+### Inference
+
+Use ***generate-times.py*** to predict time series for the GluonTS datasets in the **test** list. Specify the number of inference batches in **num_test_batches**. Results are saved to a file in [zarr](https://zarr.readthedocs.io/en/stable/) format. Each data-parallel partition writes into a separate group of the zarr file. Groups contain *ground_truth*, *past_target*, and *output* arrays: *past_target* is the context window, *ground_truth* is the ground truth for the future window, and *output* is the model output for the future window. Please see ***print_zarr.py*** in the [tools](/tools/) folder to plot series from the zarr file and save them to PDF.
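+
+A minimal sketch for inspecting the output file is shown below; the group iteration and array shapes are assumptions, and ***print_zarr.py*** remains the supported tool:
+
+```python
+import zarr
+import numpy as np
+import matplotlib.pyplot as plt
+
+root = zarr.open("output.zarr", mode="r")      # file_name from times_args["inference"]
+for name, group in root.groups():              # one group per data-parallel partition
+    past = np.asarray(group["past_target"])    # context windows
+    truth = np.asarray(group["ground_truth"])  # ground truth for the future window
+    pred = np.asarray(group["output"])         # model output for the future window
+    t0 = past.shape[-1]
+    plt.plot(range(t0), past[0], label="context")
+    plt.plot(range(t0, t0 + truth.shape[-1]), truth[0], label="ground truth")
+    plt.plot(range(t0, t0 + pred.shape[-1]), pred[0], label="prediction")
+    break                                      # plot only the first series of the first group
+plt.legend()
+plt.savefig("sample_forecast.pdf")
+```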
+
-# GPT-NeoX
+# README from GPT-NeoX
This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training.
diff --git a/configs/times_configs/49M.yml b/configs/times_configs/49M.yml
new file mode 100644
index 00000000..6880a9cf
--- /dev/null
+++ b/configs/times_configs/49M.yml
@@ -0,0 +1,217 @@
+{
+ #"save": "save",
+ #"load": "save",
+ #"checkpoint_factor": 1000,
+ #"extra_save_iters": [10, 20, 30],
+ #"keep_last_n_checkpoints": 3,
+ #"checkpoint-scale": "linear",
+
+ "gradient_accumulation_steps": 1,
+
+ "checkpoint": {
+ "tag_validation":"Warn",
+ "load_universal":false,
+ "use_node_local_storage":false,
+ "parallel_write": {
+ "pipeline_stage": false
+ },
+ },
+
+ # For TFLOPS calculation
+ "seq_length": 1040,
+
+
+ "num_gpus": 2,
+ # parallelism settings
+ "pipe_parallel_size": 2,
+ "model_parallel_size": 1,
+
+ "times_args": {
+ "context_length": 1024,
+ "prediction_length": 10,
+ "scaling": "std",
+ "shuffle_buffer_length": 1000,
+ "padding_value": 0,
+ "data_seed": 10,
+
+ "inference": {
+ "num_test_batches": 1,
+ "file_name": "output.zarr",
+ "chunk_size": 128
+ },
+
+ "datasets":{
+ "train": [
+ "airpassengers", "australian_electricity_demand", "car_parts_without_missing",
+ "cif_2016", "covid_deaths", "electricity", "electricity_weekly", "exchange_rate",
+ "fred_md", "hospital", "kaggle_web_traffic_weekly", "kdd_cup_2018_without_missing",
+ "london_smart_meters_without_missing", "nn5_daily_with_missing", "nn5_weekly", "pedestrian_counts",
+ "rideshare_without_missing", "saugeenday", "solar-energy", "solar_10_minutes", "solar_weekly", "taxi_30min",
+ "temperature_rain_without_missing", "tourism_monthly", "uber_tlc_daily", "uber_tlc_hourly", "vehicle_trips_without_missing",
+ "weather", "wiki-rolling_nips", "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly", "m4_yearly", "wind_farms_without_missing"
+ ],
+ "validation": [
+ "airpassengers", "australian_electricity_demand", "car_parts_without_missing",
+ "cif_2016", "covid_deaths", "electricity", "electricity_weekly", "exchange_rate",
+ "fred_md", "hospital", "kaggle_web_traffic_weekly", "kdd_cup_2018_without_missing",
+ "london_smart_meters_without_missing", "nn5_daily_with_missing", "nn5_weekly", "pedestrian_counts",
+ "rideshare_without_missing", "saugeenday", "solar-energy", "solar_10_minutes", "solar_weekly", "taxi_30min",
+ "temperature_rain_without_missing", "tourism_monthly", "uber_tlc_daily", "uber_tlc_hourly", "vehicle_trips_without_missing",
+ "weather", "wiki-rolling_nips", "m4_daily", "m4_hourly", "m4_monthly", "m4_quarterly", "m4_yearly", "wind_farms_without_missing"
+ ],
+ "test":[
+ "airpassengers", "australian_electricity_demand",
+ ],
+
+ "augmentation": {
+ "enabled": true,
+ "prob": 0.3,
+ "transforms": {
+ "freq_mask": {
+ "weight": 1.0,
+ "options": {
+ "rate": 0.01
+ }
+ },
+ "freq_mix": {
+ "weight": 1.0,
+ "options": {
+ "rate": 0.01
+ }
+ },
+ "permutation": {
+ "weight": 1.0,
+ "options": {
+ "max_segments": 7,
+ "seg_mode": "random"
+ }
+ },
+ "rotation": {
+ "weight": 1.0
+ },
+ "magnitude_warp": {
+ "weight": 1.0,
+ "options": {
+ "sigma": 0.7,
+ "knot": 4
+ }
+ },
+ "time_warp": {
+ "weight": 1.0,
+ "options": {
+ "sigma": 0.7,
+ "knot": 4
+ }
+ },
+ "window_slice": {
+ "weight": 1.0,
+ "options": {
+ "reduce_ratio": 0.7,
+ }
+ },
+ "window_warp": {
+ "weight": 1.0,
+ "options": {
+ "window_ratio": 0.2,
+ "scales": [0.5, 2.0],
+ }
+ }
+ }
+ },
+
+
+ }
+ },
+
+ # model settings
+ "num_layers": 10,
+ "hidden_size": 640,
+ "num_attention_heads": 10,
+ "max_position_embeddings": 2048,
+ "pos_emb": "rotary",
+ "rotary_pct": 0.25,
+ "gpt_j_residual": true,
+ "output_layer_parallelism": "column",
+
+  # these should provide some speedup but take a while to build; set to true if desired
+ "scaled_upper_triang_masked_softmax_fusion": false,
+ "bias_gelu_fusion": false,
+
+ # init methods
+ "init_method": "small_init",
+ "output_layer_init_method": "wang_init",
+
+ # optimizer settings
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.0008,
+ "betas": [0.9, 0.95],
+ "eps": 1.0e-8,
+ }
+ },
+ "min_lr": 0.00008,
+
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": True,
+ "allgather_bucket_size": 500000000,
+ "overlap_comm": True,
+ "reduce_scatter": True,
+ "reduce_bucket_size": 500000000,
+ "contiguous_gradients": True,
+ },
+
+ "csv_monitor": {
+ "enabled": true,
+ "output_path": "logs",
+ "job_name": "debug_run",
+ },
+
+ # batch / data settings
+ "train_micro_batch_size_per_gpu": 32,
+ "gas": 1,
+ "data_impl": "mmap",
+ "num_workers": 1,
+
+ # activation checkpointing
+ "checkpoint_activations": true,
+ "checkpoint_num_layers": 1,
+ "partition_activations": true,
+ "synchronize_each_layer": true,
+
+ # regularization
+ "gradient_clipping": 1.0,
+ "weight_decay": 0.1,
+ "hidden_dropout": 0,
+ "attention_dropout": 0,
+
+ "precision": "fp32",
+
+ # precision settings
+ "fp16": {
+ "fp16": false,
+ "enabled": false,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 12,
+ "hysteresis": 2,
+ "min_loss_scale": 1,
+ },
+
+ # misc. training settings
+ "train_iters": 143000,
+ "lr_decay_iters": 143000,
+ "distributed_backend": "nccl",
+ "lr_decay_style": "cosine",
+ "warmup": 0.01,
+ #"eval_interval": 100000,
+ "eval_interval": 30,
+ "eval_iters": 10,
+
+ # logging
+ "log_interval": 10,
+ "steps_per_print": 10,
+ "wall_clock_breakdown": true,
+}
diff --git a/deepy.py b/deepy.py
index c158c76c..1be4efdb 100755
--- a/deepy.py
+++ b/deepy.py
@@ -33,7 +33,7 @@ def main():
if wandb_token is not None:
deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
os.environ["WANDB_API_KEY"] = wandb_token
-
+
deepspeed.launcher.runner.main(deepspeed_main_args)
diff --git a/generate-times.py b/generate-times.py
new file mode 100644
index 00000000..859f3a22
--- /dev/null
+++ b/generate-times.py
@@ -0,0 +1,7 @@
+from megatron.laggpt.inference import inference, initialize
+from torch.distributed import barrier
+
+neox_args, model, times_envelope, data_iterator = initialize()
+inference(neox_args, model, times_envelope, data_iterator)
+
+barrier()
\ No newline at end of file
diff --git a/generate.py b/generateGPT.py
similarity index 100%
rename from generate.py
rename to generateGPT.py
diff --git a/images/TimesNeoX.svg b/images/TimesNeoX.svg
new file mode 100644
index 00000000..02cf2216
--- /dev/null
+++ b/images/TimesNeoX.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/inference_script.sh b/inference_script.sh
new file mode 100755
index 00000000..62b3a2fb
--- /dev/null
+++ b/inference_script.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Runs the "345M" parameter model
+
+# async I/O flags
+export LDFLAGS="$LDFLAGS -L/usr/lib64/"
+export CFLAGS="$CFLAGS -I/usr/include/"
+# c++ libs
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/vgurev/.conda/envs/GPT/x86_64-conda-linux-gnu/lib/
+export PATH=/data/vgurev/.conda/envs/GPT/bin/:$PATH
+
+# use mpirun, not the pytorch launcher
+export MPI=TRUE
+
+GPUS_PER_NODE=2
+NNODES=1
+export WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+python ./deepy.py generate-times.py 49M.yml
+
diff --git a/megatron/initialize.py b/megatron/initialize.py
index bc403264..8044530b 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -166,7 +166,8 @@ def _initialize_distributed(neox_args):
# this does pipe on the most outside, then data, then model.
# PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)
-
+
+
# Offset base seeds for the interior pipeline stages.
# TODO: adjust last stage too once IO is improved.
stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
diff --git a/megatron/laggpt/__init__.py b/megatron/laggpt/__init__.py
new file mode 100644
index 00000000..e25e0d66
--- /dev/null
+++ b/megatron/laggpt/__init__.py
@@ -0,0 +1,3 @@
+from .transformer_envelope import TransformerEnvelope
+from .dataloaders import combined_dataset_iterator
+from .layers import ProjectionPipe
\ No newline at end of file
diff --git a/megatron/laggpt/augmentation.py b/megatron/laggpt/augmentation.py
new file mode 100644
index 00000000..c3379dfa
--- /dev/null
+++ b/megatron/laggpt/augmentation.py
@@ -0,0 +1,304 @@
+# modified from source code from https://github.com/vgurev/pytorch-transformer-ts/blob/main/lag-gpt/aug.py
+# and https://github.com/vafl/gluon-ts/blob/ts_embeddings/src/gluonts/nursery/ts_embeddings/pt_augmentation.py
+
+
+
+import numpy as np
+import torch, random
+
+
+@torch.no_grad()
+def window_warp(x, window_ratio, scales):
+ """https://halshs.archives-ouvertes.fr/halshs-01357973/document"""
+
+ warp_scales = np.random.choice(scales, x.shape[0])
+ warp_size = np.ceil(window_ratio * x.shape[1]).astype(int)
+ window_steps = np.arange(warp_size)
+
+ window_starts = np.random.randint(
+ low=1, high=x.shape[1] - warp_size - 1, size=(x.shape[0])
+ ).astype(int)
+ window_ends = (window_starts + warp_size).astype(int)
+
+ ret = torch.zeros_like(x)
+ for i, pat in enumerate(x):
+ start_seg = pat[: window_starts[i]].cpu().numpy()
+ window_seg = np.interp(
+ np.linspace(
+ 0,
+ warp_size - 1,
+ num=int(warp_size * warp_scales[i]),
+ ),
+ window_steps,
+ pat[window_starts[i] : window_ends[i]].cpu().numpy(),
+ )
+ end_seg = pat[window_ends[i]:].cpu().numpy()
+ warped = np.concatenate((start_seg, window_seg, end_seg))
+ warp = np.interp(
+ np.arange(x.shape[1]),
+ np.linspace(0, x.shape[1] - 1.0, num=warped.size),
+ warped,
+ )
+ ret[i] = torch.from_numpy(warp).float().to(x.device)
+ return ret
+
+
+
+@torch.no_grad()
+def window_slice(x, reduce_ratio):
+ """https://halshs.archives-ouvertes.fr/halshs-01357973/document"""
+
+ target_len = np.ceil(reduce_ratio * x.shape[1]).astype(int)
+ if target_len >= x.shape[1]:
+ return x
+ starts = np.random.randint(
+ low=0, high=x.shape[1] - target_len, size=(x.shape[0])
+ ).astype(int)
+ ends = (target_len + starts).astype(int)
+
+ ret = torch.zeros_like(x)
+ for i, pat in enumerate(x):
+
+ warp = np.interp(
+ np.linspace(0, target_len, num=x.shape[1]),
+ np.arange(target_len),
+ pat[starts[i] : ends[i]].cpu().numpy(),
+ ).T
+ ret[i] = torch.from_numpy(warp).float().to(x.device)
+
+ return ret
+
+@torch.no_grad()
+def time_warp(x, sigma, knot):
+ from scipy.interpolate import CubicSpline
+
+ orig_steps = np.arange(x.shape[1])
+ random_warps = np.random.normal(
+ loc=1.0,
+ scale=sigma,
+ size=(x.shape[0], knot + 2),
+ )
+ warp_steps = np.linspace(0, x.shape[1] - 1.0, num=knot + 2)
+
+ ret = torch.zeros_like(x)
+ for i, pat in enumerate(x):
+
+ time_warp = CubicSpline(
+ warp_steps,
+ warp_steps * random_warps[i],
+ )(orig_steps)
+ scale = (x.shape[1] - 1) / time_warp[-1]
+ wrap = np.interp(
+ orig_steps,
+ np.clip(scale * time_warp, 0, x.shape[1] - 1),
+ pat.cpu().numpy(),
+ ).T
+ ret[i] = torch.from_numpy(wrap).float().to(x.device)
+
+ return ret
+
+
+@torch.no_grad()
+def magnitude_warp(x, sigma, knot):
+ from scipy.interpolate import CubicSpline
+
+ orig_steps = np.arange(x.shape[1])
+
+ random_warps = np.random.normal(
+ loc=1.0,
+ scale=sigma,
+ size=(x.shape[0], knot + 2),
+ )
+ warp_steps = np.linspace(0, x.shape[1] - 1.0, num=knot + 2)
+ ret = torch.zeros_like(x)
+ for i, pat in enumerate(x):
+ warper = CubicSpline(warp_steps, random_warps[i])(orig_steps)
+ mean = torch.mean(pat, dim = -1, keepdim = True)
+ ret[i] = (pat - mean) * torch.from_numpy(warper).float().to(x.device) + mean
+
+ return ret
+
+
+@torch.no_grad()
+def permutation(x, max_segments, seg_mode = None):
+
+ orig_steps = np.arange(x.shape[1])
+ num_segs = np.random.randint(1, max_segments, size=(x.shape[0]))
+
+ ret = torch.zeros_like(x)
+ for i, pat in enumerate(x):
+ if num_segs[i] > 1:
+
+ if seg_mode == "random":
+ split_points = np.random.choice(
+ x.shape[1] - 2, num_segs[i] - 1, replace=False
+ )
+ split_points.sort()
+ splits = np.split(orig_steps, split_points)
+ elif seg_mode is None:
+ splits = np.array_split(orig_steps, num_segs[i])
+ else:
+ raise ValueError(f"seg_mod {seg_mode} is not supported by permutation augmentation.")
+
+ random.shuffle(splits)
+ warp = np.concatenate(splits).ravel()
+ ret[i] = pat[warp]
+ else:
+ ret[i] = pat
+ return ret
+
+
+@torch.no_grad()
+def jitter(x, sigma):
+ '''
+    Gaussian noise scaled by the std of the series
+ '''
+ std = torch.std(x, dim = -1)[..., np.newaxis]
+ return x + std * torch.normal(
+ mean=0.0, std=sigma,
+ size=x.shape, device=x.device)
+
+
+@torch.no_grad()
+def rotation(x):
+ flip_index = torch.multinomial(
+ torch.tensor([0.5, 0.5], dtype=x.dtype, device=x.device),
+ num_samples=x.shape[0],replacement=True,
+ )
+ ones = torch.ones((x.shape[0]), device=x.device)
+ flip = torch.where(flip_index == 0, -ones, ones)
+ return flip[..., np.newaxis] * x
+
+
+@torch.no_grad()
+def freq_mask(xy, rate=0.1, dim=1):
+
+ xy_copy = xy
+ xy_f = torch.fft.rfft(xy, dim=dim)
+ m = torch.empty(xy_f.shape, dtype = xy.dtype).uniform_() < rate
+
+ freal = xy_f.real.masked_fill(m, 0)
+ fimag = xy_f.imag.masked_fill(m, 0)
+ xy_f = torch.complex(freal, fimag)
+ xy = torch.fft.irfft(xy_f, dim=dim)
+
+ if xy_copy.shape[dim] != xy.shape[dim]:
+ xy = torch.cat([xy_copy[:, 0:1, ...], xy], dim)
+
+ return xy
+
+
+@torch.no_grad()
+def freq_mix(xy, rate=0.1, dim=1):
+
+ xy_copy = xy
+ xy_f = torch.fft.rfft(xy, dim=dim)
+
+ m = torch.empty(xy_f.shape, dtype = xy.dtype).uniform_() < rate
+ amp = abs(xy_f)
+ _, index = amp.sort(dim=dim, descending=True)
+ dominant_mask = index > 2
+ m = torch.bitwise_and(m, dominant_mask)
+ freal = xy_f.real.masked_fill(m, 0)
+ fimag = xy_f.imag.masked_fill(m, 0)
+
+ b_idx = np.arange(xy.shape[0])
+ np.random.shuffle(b_idx)
+ xy2 = xy_copy[b_idx]
+ xy2_f = torch.fft.rfft(xy2, dim=dim)
+
+ m = torch.bitwise_not(m)
+ freal2 = xy2_f.real.masked_fill(m, 0)
+ fimag2 = xy2_f.imag.masked_fill(m, 0)
+
+ freal += freal2
+ fimag += fimag2
+
+ xy_f = torch.complex(freal, fimag)
+ xy = torch.fft.irfft(xy_f, dim=dim)
+
+ if xy_copy.shape[dim] != xy.shape[dim]:
+ xy = torch.cat([xy_copy[:, 0:1, ...], xy], 1)
+
+ return xy
+
+
+
+augmentation_map = {
+ "freq_mask": freq_mask,
+ "freq_mix": freq_mix,
+ "jitter": jitter,
+ "rotation": rotation,
+ "permutation": permutation,
+ "magnitude_warp": magnitude_warp,
+ "time_warp": time_warp,
+ "window_slice": window_slice,
+ "window_warp": window_warp,
+}
+
+
+def augmentation_factory(opt):
+
+    augmentations = []
+    values = []
+ for key, value in opt.items():
+ aug = augmentation_map[key]
+ weight = value["weight"]
+ options = value.get("options", {})
+
+ augmentations.append((aug, options))
+ values.append(weight)
+
+ values = np.array(values)
+ weights = values / np.sum(values)
+ return augmentations, weights
+
+
+class AugmentationIterator:
+ '''
+    Iterator wrapper that applies an augmentation to each batch with probability "prob".
+    Below is an example of the opt parameter.
+ {
+ "prob": 1.0,
+ "transforms": {
+ "freq_mask": {
+ "weight": 0.5,
+ "options": {
+ "rate": 0.01
+ }
+ },
+ "freq_mix": {
+ "weight": 0.5,
+ "options": {
+ "rate": 0.01
+ }
+ }
+ }
+ }
+ "options" are parameters to agmentation functions
+ '''
+ def __init__(self, opt, iterator):
+ self.iterator = iterator
+ self.prob = opt["prob"]
+ transforms = opt["transforms"]
+        self.augmentations, self.weights = augmentation_factory(transforms)
+
+
+ def __next__(self):
+
+ batch = next(self.iterator)
+
+ if random.random() < self.prob:
+ x, y = batch["past_target"], batch["future_target"]
+ x_len, y_len = x.shape[-1], y.shape[-1]
+ xy = torch.cat([x, y], dim = -1)
+
+ aug_index = np.random.choice(np.arange(len(self.weights)), size = 1, p=self.weights)
+ augmentation, pars = self.augmentations[aug_index[0]]
+ transformed_batch = augmentation(xy, **pars)
+ batch["past_target"], batch["future_target"] = torch.split(transformed_batch, [x_len, y_len], dim = -1)
+
+ return batch
+
diff --git a/megatron/laggpt/buffer_iterator.py b/megatron/laggpt/buffer_iterator.py
new file mode 100644
index 00000000..487789b8
--- /dev/null
+++ b/megatron/laggpt/buffer_iterator.py
@@ -0,0 +1,137 @@
+
+import torch
+from megatron import mpu, print_rank_0
+from .augmentation import AugmentationIterator
+
+
+from megatron.mpu import (
+ get_model_parallel_rank,
+ get_pipe_parallel_rank, get_pipe_parallel_world_size,
+ broadcast_data_first_last_layer_group
+)
+
+
+from megatron.mpu.data import broadcast_data_first_last_layer_group, broadcast_keys_first_last_layer_group
+
+
+def buffer_broadcast_iterator(iterator, size, src):
+ pipe_rank = get_pipe_parallel_rank()
+
+ need_data = (pipe_rank == 0 or pipe_rank == get_pipe_parallel_world_size() - 1)
+ if not need_data:
+ return None
+ return BufferBroadcastIterator(iterator, size, src)
+
+
+class BufferBroadcastIterator:
+ """
+    The DeepSpeed pipeline requires data batches on the first and last pipeline stages.
+    For each data-parallel group, we broadcast the batch from the first model-parallel rank of the first pipeline stage to all model-parallel ranks on the first and last stages.
+    To avoid interference with DeepSpeed communications, data is buffered at the beginning of the training, validation, and testing iterations.
+ """
+ def __init__(self, iterator, size, src):
+ self.size = size
+ self.iterator = iterator
+ self.index = 0
+ self.src = src
+
+ data, keys = None, None
+
+ if self.src:
+ data = next(self.iterator)
+ keys = list(data.keys())
+ self.keys = broadcast_keys_first_last_layer_group(keys)
+
+ data = broadcast_data_first_last_layer_group(self.keys, data, datatype=torch.float32, src = self.src)
+ self.buffer = self._update_buffer(add_data=True, data = data)
+
+
+ def _update_buffer(self, add_data = False, data = None):
+
+ buffer, size = ([data], self.size - 1) if add_data else ([], self.size)
+
+ for _ in range(size):
+ data = None
+ if self.src:
+ data = next(self.iterator)
+ if data is None:
+ raise ValueError(f"Data is None. {self.src} and {self.size} and {self.index}")
+
+
+ data = broadcast_data_first_last_layer_group(self.keys, data, datatype=torch.float32, src = self.src)
+ buffer.append(data)
+ return buffer
+
+
+ def update_buffer(self):
+ if self.index == self.size:
+ self.buffer = self._update_buffer()
+ self.index = 0
+
+
+ def __iter__(self):
+ return self
+
+
+ def __next__(self):
+ batch = self.buffer[self.index]
+ self.index += 1
+ if batch is None:
+ raise ValueError(f"Batch is None. {self.src} and {self.size} and {self.index}")
+ return batch
+
+
+def get_iterator(data):
+ return iter(data) if data else None
+
+def buffer_train_valid_test_data_iterators(neox_args, dataloaders):
+ """
+    Converts dataloaders to iterators, adding broadcast buffering (see BufferBroadcastIterator) and augmentation.
+ """
+
+ print_rank_0("> building train, validation, and test datasets ...")
+
+ src = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+ train, valid, test = dataloaders(neox_args) if src else (None, None, None)
+ train, valid, test = get_iterator(train), get_iterator(valid), get_iterator(test)
+
+ dataset_opt = neox_args.times_args["datasets"]
+ if train and ("augmentation" in dataset_opt) and dataset_opt["augmentation"]["enabled"]:
+ print_rank_0("> data_augmentation set ...")
+ train = AugmentationIterator(dataset_opt["augmentation"], train)
+
+ train_buffer_size = neox_args.gradient_accumulation_steps * neox_args.train_micro_batch_size_per_gpu
+ validation_buffer_size = neox_args.gradient_accumulation_steps * neox_args.train_micro_batch_size_per_gpu
+
+ train = buffer_broadcast_iterator(train, train_buffer_size, src)
+ valid = buffer_broadcast_iterator(valid, validation_buffer_size, src)
+
+
+ do_train = train is not None and neox_args.train_iters > 0
+ do_valid = valid is not None and neox_args.eval_iters > 0
+ do_test = test is not None and neox_args.eval_iters > 0
+
+ flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)])
+ torch.distributed.broadcast(flags, src=0)
+
+ neox_args.do_train = flags[0].item()
+ neox_args.do_valid = flags[1].item()
+ neox_args.do_test = flags[2].item()
+
+ print_rank_0("> building train, validation, and test datasets done ...")
+
+ return train, valid, test
+
+
+def buffer_test_data_iterator(neox_args, dataloader):
+ print_rank_0("> building test datasets ...")
+
+ src = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+ test = dataloader(neox_args) if src else None
+ test = get_iterator(test)
+
+ buffer_size = 1
+ test = buffer_broadcast_iterator(test, buffer_size, src)
+ return test
diff --git a/megatron/laggpt/dataloaders.py b/megatron/laggpt/dataloaders.py
new file mode 100644
index 00000000..37bea6e3
--- /dev/null
+++ b/megatron/laggpt/dataloaders.py
@@ -0,0 +1,253 @@
+from typing import Optional, Iterable, Dict, Any
+from functools import partial
+
+import torch
+import numpy as np
+import megatron.mpu as mpu
+import random
+
+from .datasets import get_combined_dataset
+from .buffer_iterator import buffer_train_valid_test_data_iterators, buffer_test_data_iterator
+
+
+from gluonts.dataset.loader import as_stacked_batches
+from gluonts.dataset.common import Dataset
+from gluonts.dataset.field_names import FieldName
+from gluonts.itertools import Cyclic
+
+
+from gluonts.dataset.repository.datasets import get_dataset
+from gluonts.transform import (
+ Chain,
+ Transformation,
+ ValidationSplitSampler,
+ TestSplitSampler,
+ AddObservedValuesIndicator,
+ ExpectedNumInstanceSampler,
+ DummyValueImputation,
+ InstanceSampler,
+ InstanceSplitter,
+)
+
+
+PREDICTION_INPUT_NAMES = ["past_target", "past_observed_values"]
+TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [
+ "future_target",
+ "future_observed_values",
+]
+
+def create_instance_splitter(
+ sampler, prediction_length, past_length, padding_value):
+
+ return InstanceSplitter(
+ target_field=FieldName.TARGET,
+ is_pad_field=FieldName.IS_PAD,
+ start_field=FieldName.START,
+ forecast_start_field=FieldName.FORECAST_START,
+ instance_sampler=sampler,
+ past_length=past_length,
+ future_length=prediction_length,
+ time_series_fields=[FieldName.OBSERVED_VALUES],
+ dummy_value=padding_value,
+ )
+
+
+def create_data_loader(
+ data: Dataset,
+ batch_size,
+ num_batches_per_epoch,
+ field_names,
+ shuffle_buffer_length: Optional[int] = None):
+
+ batches = as_stacked_batches(
+ data,
+ batch_size=batch_size,
+ shuffle_buffer_length=shuffle_buffer_length,
+ field_names=field_names,
+ output_type=torch.tensor,
+ num_batches_per_epoch=num_batches_per_epoch,
+ )
+ return batches
+
+
+def get_training_data_loader(
+ data: Dataset, batch_size, num_batches_per_epoch, shuffle_buffer_length,
+ past_length, prediction_length, padding_value):
+
+ transform = AddObservedValuesIndicator(
+ target_field=FieldName.TARGET,
+ output_field=FieldName.OBSERVED_VALUES,
+ imputation_method=DummyValueImputation(0.0))
+
+ data = transform.apply(data, is_train=True)
+
+ sampler = ExpectedNumInstanceSampler(num_instances=1.0, min_future=prediction_length)
+ instance_splitter = create_instance_splitter(sampler, prediction_length, past_length, padding_value)
+ data = instance_splitter.apply(Cyclic(data).stream(), is_train=True)
+
+ data_loader = create_data_loader(
+ data, batch_size,
+ num_batches_per_epoch, TRAINING_INPUT_NAMES, shuffle_buffer_length)
+ return data_loader
+
+
+def get_validation_data_loader(
+ data: Dataset, batch_size, num_batches_per_epoch, shuffle_buffer_length,
+ past_length, prediction_length, padding_value):
+
+ transform = AddObservedValuesIndicator(
+ target_field=FieldName.TARGET,
+ output_field=FieldName.OBSERVED_VALUES,
+ imputation_method=DummyValueImputation(0.0))
+
+ data = transform.apply(data, is_train=True)
+
+ sampler = ValidationSplitSampler(min_future=prediction_length)
+ instance_splitter = create_instance_splitter(sampler, prediction_length, past_length, padding_value)
+ data = instance_splitter.apply(Cyclic(data).stream(), is_train=True)
+
+ data_loader = create_data_loader(
+ data, batch_size,
+ num_batches_per_epoch, TRAINING_INPUT_NAMES, shuffle_buffer_length)
+ return data_loader
+
+
+def get_test_gluonts_dataloader(
+ data: Dataset, batch_size, num_batches_per_epoch, shuffle_buffer_length,
+ past_length, prediction_length, padding_value):
+
+ transform = AddObservedValuesIndicator(
+ target_field=FieldName.TARGET,
+ output_field=FieldName.OBSERVED_VALUES,
+ imputation_method=DummyValueImputation(0.0))
+
+ data = transform.apply(data, is_train=True)
+
+ sampler = ValidationSplitSampler(min_future=prediction_length)
+ instance_splitter = create_instance_splitter(sampler, prediction_length, past_length, padding_value)
+ data = instance_splitter.apply(Cyclic(data).stream(), is_train=True)
+
+ data_loader = create_data_loader(
+ data, batch_size,
+ num_batches_per_epoch, TRAINING_INPUT_NAMES, shuffle_buffer_length)
+ return data_loader
+
+
+
+
+def get_train_valid_test_dataloaders(neox_args, train_dataset, validation_dataset):
+ """
+    Creates dataloaders from the datasets. Preprocesses the time series: samples windows with
+    instance samplers and creates new fields and flags.
+ """
+
+ times_args = neox_args.times_args
+
+ shuffle_buffer_length = times_args["shuffle_buffer_length"]
+ prediction_length = times_args["prediction_length"]
+ padding_value = times_args["padding_value"]
+ past_length = times_args["past_length"]
+
+ batch_size = neox_args.train_micro_batch_size_per_gpu
+ num_batches_per_epoch = neox_args.train_iters * neox_args.gradient_accumulation_steps
+
+
+ training_data_loader = get_training_data_loader(
+ train_dataset, batch_size, num_batches_per_epoch, shuffle_buffer_length,
+ past_length, prediction_length, padding_value)
+
+ validation_data_loader = get_training_data_loader(
+ validation_dataset, batch_size, num_batches_per_epoch, shuffle_buffer_length,
+ past_length, prediction_length, padding_value)
+
+ return training_data_loader, validation_data_loader, None
+
+
+def combined_dataset_iterator(neox_args):
+ """
+    The central function that creates dataset iterators for training from the list of GluonTS datasets.
+    Builds the combined GluonTS datasets, creates dataloaders, and wraps them into iterators.
+ """
+ src = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+ times_args = neox_args.times_args
+ datasets = times_args["datasets"]
+ train_datasets = datasets["train"]
+ validation_datasets = datasets["validation"]
+
+ preload_datasets(train_datasets + validation_datasets)
+
+ if src:
+ rank = mpu.get_data_parallel_rank()
+ iteration_index = neox_args.iteration
+ data_seed = times_args["data_seed"] + iteration_index
+ np.random.seed(data_seed)
+ random.seed(data_seed)
+ train_datasets = get_combined_dataset(train_datasets, rank, data_seed)
+ validation_datasets = get_combined_dataset(validation_datasets, rank, data_seed)
+
+ dataloaders = partial(get_train_valid_test_dataloaders, train_dataset = train_datasets, validation_dataset = validation_datasets)
+ else:
+ dataloaders = None
+
+ train, validation, test = buffer_train_valid_test_data_iterators(neox_args, dataloaders)
+ #TODO Test is not supported yet
+ return train, validation, test
+
+
+def get_test_dataloader(neox_args, dataset):
+ """
+ Dataloader for test stage
+ """
+ times_args = neox_args.times_args
+
+ shuffle_buffer_length = times_args["shuffle_buffer_length"]
+ prediction_length = times_args["prediction_length"]
+ padding_value = times_args["padding_value"]
+ past_length = times_args["past_length"]
+ n_batches = times_args["inference"]["num_test_batches"]
+
+ batch_size = neox_args.train_micro_batch_size_per_gpu
+
+ test_dataloader = get_test_gluonts_dataloader(
+ dataset, batch_size, n_batches, shuffle_buffer_length,
+ past_length, prediction_length, padding_value)
+
+ return test_dataloader
+
+def preload_datasets(datasets):
+
+ if torch.distributed.get_rank() == 0:
+ for i in datasets:
+ get_dataset(i)
+
+ torch.distributed.barrier()
+
+def combined_test_dataset_iterator(neox_args):
+ """
+    The central function that creates a dataset iterator for testing from the list of GluonTS datasets.
+    Builds the combined GluonTS dataset, creates a dataloader, and wraps it into an iterator.
+ """
+
+ src = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+ times_args = neox_args.times_args
+ test_datasets = times_args["datasets"]["test"]
+ preload_datasets(test_datasets)
+
+ if src:
+ rank = mpu.get_data_parallel_rank()
+ dataset = get_combined_dataset(test_datasets, rank, times_args["data_seed"], test = True)
+ dataloader = partial(get_test_dataloader, dataset = dataset)
+ else:
+ dataloader = None
+
+ test = buffer_test_data_iterator(neox_args, dataloader)
+ return test
+
+
+
+
+
+
+
+
diff --git a/megatron/laggpt/datasets.py b/megatron/laggpt/datasets.py
new file mode 100644
index 00000000..db385346
--- /dev/null
+++ b/megatron/laggpt/datasets.py
@@ -0,0 +1,52 @@
+from gluonts.dataset.repository.datasets import get_dataset as get_gluonts_dataset
+import random
+
+
+class CombinedDatasetIterator:
+ def __init__(self, datasets, seed, weights):
+ self._datasets = [iter(el) for el in datasets]
+ self._weights = weights.copy()
+ self._rng = random.Random(seed)
+
+ def __next__(self):
+
+ data = None
+ while not data:
+ (index, ) = self._rng.choices(range(len(self._datasets)), weights=self._weights, k=1)
+ try:
+ data = next(self._datasets[index])
+ except StopIteration:
+ del self._datasets[index]
+ del self._weights[index]
+ if len(self._datasets) == 0:
+ raise StopIteration
+
+ return data
+
+
+class CombinedDataset:
+ def __init__(self, datasets, seed=None, weights=None):
+ self._seed = seed
+ self._datasets = datasets
+ self._weights = weights
+ n_datasets = len(datasets)
+ if weights is None:
+ self._weights = [1 / n_datasets] * n_datasets
+
+ def __iter__(self):
+ return CombinedDatasetIterator(self._datasets, self._seed, self._weights)
+
+ def __len__(self):
+ return sum([len(ds) for ds in self._datasets])
+
+
+
+
+def get_combined_dataset(dataset_names, rank, seed, test = False):
+
+    if test:
+        gluonts_ds = [get_gluonts_dataset(name).test for name in dataset_names]
+    else:
+        gluonts_ds = [get_gluonts_dataset(name).train for name in dataset_names]
+    return CombinedDataset(gluonts_ds, seed=rank + seed)
+
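+
+# Illustrative sketch (not part of the training path): CombinedDataset only needs
+# iterables of entries, so plain lists stand in for GluonTS datasets here.
+if __name__ == "__main__":
+    ds_a = [{"source": "a", "idx": i} for i in range(3)]
+    ds_b = [{"source": "b", "idx": i} for i in range(5)]
+    combined = CombinedDataset([ds_a, ds_b], seed=42)
+    print(len(combined))        # 8
+    for entry in combined:      # entries are drawn from a randomly chosen source;
+        print(entry)            # a source is dropped once it is exhausted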
diff --git a/megatron/laggpt/gluontstorch/__init__.py b/megatron/laggpt/gluontstorch/__init__.py
new file mode 100644
index 00000000..20b3421f
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/__init__.py
@@ -0,0 +1,4 @@
+'''
+Modules in this folder contain code from GluonTS (https://github.com/awslabs/gluonts).
+The classes and functions here could not be imported directly from the GluonTS library because its torch modules depend on PyTorch Lightning.
+'''
\ No newline at end of file
diff --git a/megatron/laggpt/gluontstorch/affine_transformed.py b/megatron/laggpt/gluontstorch/affine_transformed.py
new file mode 100644
index 00000000..f0c6de4a
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/affine_transformed.py
@@ -0,0 +1,67 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from torch.distributions import (
+ AffineTransform,
+ Distribution,
+ TransformedDistribution,
+)
+
+
+class AffineTransformed(TransformedDistribution):
+ """
+ Represents the distribution of an affinely transformed random variable.
+
+ This is the distribution of ``Y = scale * X + loc``, where ``X`` is a
+ random variable distributed according to ``base_distribution``.
+
+ Parameters
+ ----------
+ base_distribution
+ Original distribution
+ loc
+ Translation parameter of the affine transformation.
+ scale
+ Scaling parameter of the affine transformation.
+ """
+
+ def __init__(self, base_distribution: Distribution, loc=None, scale=None):
+ self.scale = 1.0 if scale is None else scale
+ self.loc = 0.0 if loc is None else loc
+
+ super().__init__(
+ base_distribution, [AffineTransform(self.loc, self.scale)]
+ )
+
+ @property
+ def mean(self):
+ """
+ Returns the mean of the distribution.
+ """
+ return self.base_dist.mean * self.scale + self.loc
+
+ @property
+ def variance(self):
+ """
+ Returns the variance of the distribution.
+ """
+ return self.base_dist.variance * self.scale**2
+
+ @property
+ def stddev(self):
+ """
+ Returns the standard deviation of the distribution.
+ """
+ return self.variance.sqrt()
+
+
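+
+# Illustrative sanity check (not part of the library): wrapping a standard Normal
+# should reproduce the usual loc/scale-shifted moments.
+if __name__ == "__main__":
+    import torch
+    from torch.distributions import Normal
+
+    base = Normal(loc=torch.zeros(3), scale=torch.ones(3))
+    shifted = AffineTransformed(base, loc=torch.tensor(5.0), scale=torch.tensor(2.0))
+    print(shifted.mean)      # tensor([5., 5., 5.])
+    print(shifted.stddev)    # tensor([2., 2., 2.])
+    print(shifted.log_prob(torch.tensor([5.0, 7.0, 3.0])))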
diff --git a/megatron/laggpt/gluontstorch/distribution_output.py b/megatron/laggpt/gluontstorch/distribution_output.py
new file mode 100644
index 00000000..54c71e27
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/distribution_output.py
@@ -0,0 +1,263 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from typing import Callable, Dict, Optional, Tuple, Type
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Beta, Distribution, Gamma, Normal, Poisson
+
+from gluonts.core.component import validated
+from .affine_transformed import AffineTransformed
+
+
+
+class LambdaLayer(nn.Module):
+ def __init__(self, function):
+ super().__init__()
+ self._func = function
+
+ def forward(self, x, *args):
+ return self._func(x, *args)
+
+
+class PtArgProj(nn.Module):
+ r"""
+ A PyTorch module that can be used to project from a dense layer
+ to PyTorch distribution arguments.
+
+ Parameters
+ ----------
+ in_features
+ Size of the incoming features.
+    args_dim
+        Dictionary mapping each argument name to its dimension; these
+        dimensions are used to build the projection layers, while the
+        names themselves are not used.
+    domain_map
+        Function (or ``nn.Module``) that is called with the projected
+        arguments and should return a tuple of outputs that will be
+        used when calling the distribution constructor.
+ """
+
+ def __init__(
+ self,
+ in_features: int,
+ args_dim: Dict[str, int],
+ domain_map: Callable[..., Tuple[torch.Tensor]],
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ self.args_dim = args_dim
+ self.proj = nn.ModuleList(
+ [nn.Linear(in_features, dim) for dim in args_dim.values()]
+ )
+ self.domain_map = domain_map
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
+ params_unbounded = [proj(x) for proj in self.proj]
+
+ return self.domain_map(*params_unbounded)
+
+
+class Output:
+ """
+ Class to connect a network to some output.
+ """
+
+ in_features: int
+ args_dim: Dict[str, int]
+ _dtype: Type = np.float32
+
+ @property
+ def dtype(self):
+ return self._dtype
+
+ @dtype.setter
+ def dtype(self, dtype: Type):
+ self._dtype = dtype
+
+ def get_args_proj(self, in_features: int) -> nn.Module:
+ return PtArgProj(
+ in_features=in_features,
+ args_dim=self.args_dim,
+ domain_map=LambdaLayer(self.domain_map),
+ )
+
+ def domain_map(self, *args: torch.Tensor):
+ raise NotImplementedError()
+
+
+class DistributionOutput(Output):
+ r"""
+ Class to construct a distribution given the output of a network.
+ """
+
+ distr_cls: type
+
+ @validated()
+ def __init__(self) -> None:
+ pass
+
+ def _base_distribution(self, distr_args):
+ return self.distr_cls(*distr_args)
+
+ def distribution(
+ self,
+ distr_args,
+ loc: Optional[torch.Tensor] = None,
+ scale: Optional[torch.Tensor] = None,
+ ) -> Distribution:
+ r"""
+ Construct the associated distribution, given the collection of
+ constructor arguments and, optionally, a scale tensor.
+
+ Parameters
+ ----------
+ distr_args
+ Constructor arguments for the underlying Distribution type.
+ loc
+ Optional tensor, of the same shape as the
+ batch_shape+event_shape of the resulting distribution.
+ scale
+ Optional tensor, of the same shape as the
+ batch_shape+event_shape of the resulting distribution.
+ """
+ distr = self._base_distribution(distr_args)
+ if loc is None and scale is None:
+ return distr
+ else:
+ return AffineTransformed(distr, loc=loc, scale=scale)
+
+ @property
+ def event_shape(self) -> Tuple:
+ r"""
+ Shape of each individual event contemplated by the distributions
+ that this object constructs.
+ """
+ raise NotImplementedError()
+
+ @property
+ def event_dim(self) -> int:
+ r"""
+ Number of event dimensions, i.e., length of the `event_shape` tuple,
+ of the distributions that this object constructs.
+ """
+ return len(self.event_shape)
+
+ @property
+ def value_in_support(self) -> float:
+ r"""
+ A float that will have a valid numeric value when computing the
+ log-loss of the corresponding distribution. By default 0.0.
+ This value will be used when padding data series.
+ """
+ return 0.0
+
+ def domain_map(self, *args: torch.Tensor):
+ r"""
+ Converts arguments to the right shape and domain. The domain depends
+ on the type of distribution, while the correct shape is obtained by
+ reshaping the trailing axis in such a way that the returned tensors
+ define a distribution of the right event_shape.
+ """
+ raise NotImplementedError()
+
+
+class NormalOutput(DistributionOutput):
+ args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
+ distr_cls: type = Normal
+
+ @classmethod
+ def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
+ scale = F.softplus(scale)
+ return loc.squeeze(-1), scale.squeeze(-1)
+
+ @property
+ def event_shape(self) -> Tuple:
+ return ()
+
+
+class BetaOutput(DistributionOutput):
+ args_dim: Dict[str, int] = {"concentration1": 1, "concentration0": 1}
+ distr_cls: type = Beta
+
+ @classmethod
+ def domain_map(
+ cls, concentration1: torch.Tensor, concentration0: torch.Tensor
+ ):
+ epsilon = np.finfo(cls._dtype).eps # machine epsilon
+ concentration1 = F.softplus(concentration1) + epsilon
+ concentration0 = F.softplus(concentration0) + epsilon
+ return concentration1.squeeze(dim=-1), concentration0.squeeze(dim=-1)
+
+ @property
+ def event_shape(self) -> Tuple:
+ return ()
+
+ @property
+ def value_in_support(self) -> float:
+ return 0.5
+
+
+class GammaOutput(DistributionOutput):
+ args_dim: Dict[str, int] = {"concentration": 1, "rate": 1}
+ distr_cls: type = Gamma
+
+ @classmethod
+ def domain_map(cls, concentration: torch.Tensor, rate: torch.Tensor):
+ epsilon = np.finfo(cls._dtype).eps # machine epsilon
+ concentration = F.softplus(concentration) + epsilon
+ rate = F.softplus(rate) + epsilon
+ return concentration.squeeze(dim=-1), rate.squeeze(dim=-1)
+
+ @property
+ def event_shape(self) -> Tuple:
+ return ()
+
+ @property
+ def value_in_support(self) -> float:
+ return 0.5
+
+
+class PoissonOutput(DistributionOutput):
+ args_dim: Dict[str, int] = {"rate": 1}
+ distr_cls: type = Poisson
+
+ @classmethod
+ def domain_map(cls, rate: torch.Tensor):
+ rate_pos = F.softplus(rate).clone()
+ return (rate_pos.squeeze(-1),)
+
+ # Overwrites the parent class method. We cannot scale using the affine
+ # transformation since Poisson should return integers. Instead we scale
+ # the parameters.
+ def distribution(
+ self,
+ distr_args,
+ loc: Optional[torch.Tensor] = None,
+ scale: Optional[torch.Tensor] = None,
+ ) -> Distribution:
+ (rate,) = distr_args
+
+ if scale is not None:
+ rate *= scale
+
+ return Poisson(rate=rate)
+
+ @property
+ def event_shape(self) -> Tuple:
+ return ()
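+
+# Illustrative sketch: a distribution head maps network features to distribution
+# arguments and then to a Distribution (the hidden size of 16 is arbitrary).
+if __name__ == "__main__":
+    distr_output = NormalOutput()
+    args_proj = distr_output.get_args_proj(in_features=16)
+    hidden = torch.randn(4, 10, 16)          # (batch, time, features)
+    distr_args = args_proj(hidden)           # (loc, scale), each of shape (4, 10)
+    distr = distr_output.distribution(distr_args)
+    target = torch.randn(4, 10)
+    print(distr.log_prob(target).shape)      # torch.Size([4, 10])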
diff --git a/megatron/laggpt/gluontstorch/loss.py b/megatron/laggpt/gluontstorch/loss.py
new file mode 100644
index 00000000..aad9e096
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/loss.py
@@ -0,0 +1,73 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+import torch
+from pydantic import BaseModel
+
+
+class DistributionLoss(BaseModel):
+ """
+    A callable loss that computes loss values by comparing a
+ ``Distribution`` (prediction) to a ``Tensor`` (ground-truth).
+ """
+
+ def __call__(
+ self, input: torch.distributions.Distribution, target: torch.Tensor
+ ) -> torch.Tensor:
+ """
+ Compute the loss of predicting ``target`` with the ``input``
+ distribution.
+
+ Parameters
+ ----------
+ input
+ Distribution object representing the prediction.
+ target
+ Tensor containing the ground truth.
+
+ Returns
+ -------
+ torch.Tensor
+ Tensor containing loss values, with the same shape as ``target``.
+
+ Raises
+ ------
+ NotImplementedError
+            Always raised; subclasses must override ``__call__``.
+ """
+ raise NotImplementedError
+
+
+class NegativeLogLikelihood(DistributionLoss):
+ """
+ Compute the negative log likelihood loss.
+
+ Parameters
+ ----------
+ beta: float in range (0, 1)
+ beta parameter from the paper: "On the Pitfalls of Heteroscedastic
+ Uncertainty Estimation with Probabilistic Neural Networks" by
+ Seitzer et al. 2022
+ https://openreview.net/forum?id=aPOpXlnV1T
+ """
+
+ beta: float = 0.0
+
+ def __call__(
+ self, input: torch.distributions.Distribution, target: torch.Tensor
+ ) -> torch.Tensor:
+ nll = -input.log_prob(target)
+ if self.beta > 0.0:
+ variance = input.variance
+ nll = nll * (variance.detach() ** self.beta)
+ return nll
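+
+# Illustrative sketch: with beta=0 the loss is exactly -log_prob; beta > 0 rescales
+# it by the detached predicted variance raised to the power beta (Seitzer et al., 2022).
+if __name__ == "__main__":
+    normal = torch.distributions.Normal(loc=torch.zeros(5), scale=torch.ones(5))
+    target = torch.randn(5)
+    nll = NegativeLogLikelihood()
+    print(torch.allclose(nll(normal, target), -normal.log_prob(target)))   # True
+    print(NegativeLogLikelihood(beta=0.5)(normal, target))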
diff --git a/megatron/laggpt/gluontstorch/scaler.py b/megatron/laggpt/gluontstorch/scaler.py
new file mode 100644
index 00000000..38f61dee
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/scaler.py
@@ -0,0 +1,176 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from __future__ import annotations
+from typing import Optional
+
+import torch
+
+from gluonts.core.component import validated
+
+
+class Scaler:
+ def __call__(
+ self, data: torch.Tensor, observed_indicator: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ raise NotImplementedError
+
+
+class MeanScaler(Scaler):
+ """
+ Computes a scaling factor as the weighted average absolute value along
+ dimension ``dim``, and scales the data accordingly.
+
+ Parameters
+ ----------
+ dim
+ dimension along which to compute the scale
+ keepdim
+ controls whether to retain dimension ``dim`` (of length 1) in the
+ scale tensor, or suppress it.
+ default_scale
+ default scale that is used for elements that are constantly zero
+ minimum_scale
+ minimum possible scale that is used for any item.
+ """
+
+ @validated()
+ def __init__(
+ self,
+ dim: int = -1,
+ keepdim: bool = False,
+ default_scale: Optional[float] = None,
+ minimum_scale: float = 1e-10,
+ ) -> None:
+ self.dim = dim
+ self.keepdim = keepdim
+ self.default_scale = default_scale
+ self.minimum_scale = minimum_scale
+
+ def __call__(
+ self, data: torch.Tensor, observed_indicator: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ # shape: (N, [C], T=1)
+ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+ num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+ scale = ts_sum / torch.clamp(num_observed, min=1)
+
+ # If `default_scale` is provided, we use it, otherwise we use the scale
+ # of the batch.
+ if self.default_scale is None:
+ batch_sum = ts_sum.sum(dim=0)
+ batch_observations = torch.clamp(num_observed.sum(0), min=1)
+ default_scale = torch.squeeze(batch_sum / batch_observations)
+ else:
+ default_scale = self.default_scale * torch.ones_like(scale)
+
+ # apply default scale where there are no observations
+ scale = torch.where(
+ num_observed > 0,
+ scale,
+ default_scale,
+ )
+
+ # ensure the scale is at least `self.minimum_scale`
+ scale = torch.clamp(scale, min=self.minimum_scale)
+
+ scaled_data = data / scale
+
+ if not self.keepdim:
+ scale = scale.squeeze(dim=self.dim)
+
+ loc = torch.zeros_like(scale)
+
+ return scaled_data, loc, scale
+
+
+class NOPScaler(Scaler):
+ """
+ Assigns a scaling factor equal to 1 along dimension ``dim``, and therefore
+ applies no scaling to the input data.
+
+ Parameters
+ ----------
+ dim
+ dimension along which to compute the scale
+ keepdim
+ controls whether to retain dimension ``dim`` (of length 1) in the
+ scale tensor, or suppress it.
+ """
+
+ @validated()
+ def __init__(
+ self,
+ dim: int = -1,
+ keepdim: bool = False,
+ ) -> None:
+ self.dim = dim
+ self.keepdim = keepdim
+
+ def __call__(
+ self, data: torch.Tensor, observed_indicator: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ scale = torch.ones_like(data).mean(
+ dim=self.dim,
+ keepdim=self.keepdim,
+ )
+ loc = torch.zeros_like(scale)
+ return data, loc, scale
+
+
+class StdScaler(Scaler):
+ """
+ Computes a std scaling value along dimension ``dim``, and scales the data accordingly.
+
+ Parameters
+ ----------
+ dim
+ dimension along which to compute the scale
+ keepdim
+ controls whether to retain dimension ``dim`` (of length 1) in the
+ scale tensor, or suppress it.
+ minimum_scale
+ default scale that is used for elements that are constantly zero
+ along dimension ``dim``.
+ """
+
+ @validated()
+ def __init__(
+ self,
+ dim: int = -1,
+ keepdim: bool = False,
+ minimum_scale: float = 1e-5,
+ ) -> None:
+ self.dim = dim
+ self.keepdim = keepdim
+ self.minimum_scale = minimum_scale
+
+ def __call__(
+ self, data: torch.Tensor, weights: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ assert (
+ data.shape == weights.shape
+ ), "data and weights must have same shape"
+ with torch.no_grad():
+ denominator = weights.sum(self.dim, keepdim=self.keepdim)
+ denominator = denominator.clamp_min(1.0)
+ loc = (data * weights).sum(
+ self.dim, keepdim=self.keepdim
+ ) / denominator
+
+ variance = (((data - loc) * weights) ** 2).sum(
+ self.dim, keepdim=self.keepdim
+ ) / denominator
+ scale = torch.sqrt(variance + self.minimum_scale)
+ return (data - loc) / scale, loc, scale
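+
+# Illustrative sketch: StdScaler standardizes each series using only the observed
+# points along the time axis and returns (scaled_data, loc, scale).
+if __name__ == "__main__":
+    data = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+    observed = torch.tensor([[1.0, 1.0, 1.0, 0.0]])   # the last point is missing
+    scaled, loc, scale = StdScaler(dim=-1, keepdim=True)(data, observed)
+    print(loc, scale)    # loc = 2.0, scale ~ 0.816 (std over the observed points)
+    print(scaled)        # every point is scaled with loc/scale from observed data only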
diff --git a/megatron/laggpt/gluontstorch/studentt.py b/megatron/laggpt/gluontstorch/studentt.py
new file mode 100644
index 00000000..44dae436
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/studentt.py
@@ -0,0 +1,77 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from typing import Dict, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from scipy.stats import t as ScipyStudentT
+from torch.distributions import StudentT as TorchStudentT
+
+from .distribution_output import DistributionOutput
+from gluonts.util import lazy_property
+
+
+class StudentT(TorchStudentT):
+    """Student's t-distribution parametrized by degrees of freedom `df`,
+ mean `loc` and scale `scale`.
+
+ Based on torch.distributions.StudentT, with added `cdf` and `icdf` methods.
+ """
+
+ def __init__(
+ self,
+ df: Union[float, torch.Tensor],
+ loc: Union[float, torch.Tensor] = 0.0,
+ scale: Union[float, torch.Tensor] = 1.0,
+ validate_args=None,
+ ):
+ super().__init__(
+ df=df, loc=loc, scale=scale, validate_args=validate_args
+ )
+
+ def cdf(self, value: torch.Tensor) -> torch.Tensor:
+ if self._validate_args:
+ self._validate_sample(value)
+ result = self.scipy_student_t.cdf(value.detach().cpu().numpy())
+ return torch.tensor(result, device=value.device, dtype=value.dtype)
+
+ def icdf(self, value: torch.Tensor) -> torch.Tensor:
+ result = self.scipy_student_t.ppf(value.detach().cpu().numpy())
+ return torch.tensor(result, device=value.device, dtype=value.dtype)
+
+ @lazy_property
+ def scipy_student_t(self):
+ return ScipyStudentT(
+ df=self.df.detach().cpu().numpy(),
+ loc=self.loc.detach().cpu().numpy(),
+ scale=self.scale.detach().cpu().numpy(),
+ )
+
+
+class StudentTOutput(DistributionOutput):
+ args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
+ distr_cls: type = StudentT
+
+ @classmethod
+ def domain_map(
+ cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor
+ ):
+ epsilon = torch.finfo(scale.dtype).eps
+ scale = F.softplus(scale).clamp_min(epsilon)
+ df = 2.0 + F.softplus(df)
+ return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)
+
+ @property
+ def event_shape(self) -> Tuple:
+ return ()
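+
+# Illustrative sketch: the domain map adds 2 to a softplus-transformed df and clamps
+# the scale away from zero, whatever raw values the projection layer produces.
+if __name__ == "__main__":
+    raw = torch.full((2, 1), -100.0)
+    df, loc, scale = StudentTOutput.domain_map(raw, raw, raw)
+    print(df)      # ~2.0, since softplus of a very negative number is ~0
+    print(scale)   # clamped to float32 machine epsilon, never exactly zero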
diff --git a/megatron/laggpt/gluontstorch/utils.py b/megatron/laggpt/gluontstorch/utils.py
new file mode 100644
index 00000000..6c8236cc
--- /dev/null
+++ b/megatron/laggpt/gluontstorch/utils.py
@@ -0,0 +1,235 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from typing import List, Optional, Type
+import inspect
+
+import torch
+
+
+def copy_parameters(
+ net_source: torch.nn.Module,
+ net_dest: torch.nn.Module,
+ strict: Optional[bool] = True,
+) -> None:
+ """
+ Copies parameters from one network to another.
+
+ Parameters
+ ----------
+ net_source
+ Input network.
+ net_dest
+ Output network.
+ strict:
+ whether to strictly enforce that the keys
+ in :attr:`state_dict` match the keys returned by this module's
+ :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
+ """
+
+ net_dest.load_state_dict(net_source.state_dict(), strict=strict)
+
+
+def get_forward_input_names(module: Type[torch.nn.Module]):
+ params = inspect.signature(module.forward).parameters
+ param_names = [k for k, v in params.items() if not str(v).startswith("*")]
+ assert param_names[0] == "self", (
+ "Expected first argument of forward to be `self`, "
+ f"but found `{param_names[0]}`"
+ )
+ return param_names[1:] # skip: self
+
+
+def weighted_average(
+ x: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None
+) -> torch.Tensor:
+ """
+    Computes the weighted average of a given tensor across a given dim, masking
+    values associated with weight zero, meaning instead of `nan * 0 = nan` you
+    will get `0 * 0 = 0`.
+
+ Parameters
+ ----------
+ x
+ Input tensor, of which the average must be computed.
+ weights
+ Weights tensor, of the same shape as `x`.
+ dim
+ The dim along which to average `x`
+
+ Returns
+ -------
+ Tensor:
+ The tensor with values averaged along the specified `dim`.
+ """
+ if weights is not None:
+ weighted_tensor = torch.where(
+ weights != 0, x * weights, torch.zeros_like(x)
+ )
+ sum_weights = torch.clamp(
+ weights.sum(dim=dim) if dim else weights.sum(), min=1.0
+ )
+ return (
+ weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()
+ ) / sum_weights
+ else:
+ return x.mean(dim=dim)
+
+
+def lagged_sequence_values(
+ indices: List[int],
+ prior_sequence: torch.Tensor,
+ sequence: torch.Tensor,
+ dim: int,
+) -> torch.Tensor:
+ """
+ Constructs an array of lagged values from a given sequence.
+
+ Parameters
+ ----------
+ indices
+ Indices of the lagged observations. For example, ``[0]`` indicates
+        that, at any time ``t``, the output will have only the observation from
+ time ``t`` itself; instead, ``[0, 24]`` indicates that the output
+ will have observations from times ``t`` and ``t-24``.
+ prior_sequence
+ Tensor containing the input sequence prior to the time range for
+ which the output is required.
+ sequence
+ Tensor containing the input sequence in the time range where the
+ output is required.
+ dim
+ Time dimension.
+
+ Returns
+ -------
+ Tensor
+ A tensor of shape (*sequence.shape, len(indices)).
+ """
+ assert max(indices) <= prior_sequence.shape[dim], (
+ f"lags cannot go further than prior sequence length, found lag"
+ f" {max(indices)} while prior sequence is only"
+ f" {prior_sequence.shape[dim]}-long"
+ )
+
+ full_sequence = torch.cat((prior_sequence, sequence), dim=dim)
+
+ lags_values = []
+ for lag_index in indices:
+ begin_index = -lag_index - sequence.shape[dim]
+ end_index = -lag_index if lag_index > 0 else None
+ lags_values.append(
+ slice_along_dim(
+ full_sequence, dim=dim, slice_=slice(begin_index, end_index)
+ )
+ )
+
+ return torch.stack(lags_values, dim=-1)
+
+
+def repeat_along_dim(a: torch.Tensor, dim: int, repeats: int) -> torch.Tensor:
+ """
+ Repeat a tensor along a given dimension, using ``torch.repeat`` internally.
+
+ Parameters
+ ----------
+ a
+ Original tensor to repeat.
+ dim
+ Dimension to repeat data over.
+ repeats
+        How many times to repeat the input tensor.
+
+ Returns
+ -------
+ torch.Tensor
+ A tensor with the same size as the input one, except dimension
+ ``dim`` which is multiplied by ``repeats``.
+ """
+ if repeats == 1:
+ return a
+ r = [1] * len(a.shape)
+ r[dim] = repeats
+ return a.repeat(*r)
+
+
+def slice_along_dim(a: torch.Tensor, dim: int, slice_: slice) -> torch.Tensor:
+ """
+ Slice a tensor along a given dimension.
+
+ Parameters
+ ----------
+ a
+ Original tensor to slice.
+ dim
+ Dimension to slice over.
+ slice_
+ Slice to take.
+
+ Returns
+ -------
+ torch.Tensor
+ A tensor with the same size as the input one, except dimension
+ ``dim`` which has length equal to the slice length.
+ """
+ idx = [slice(None)] * len(a.shape)
+ idx[dim] = slice_
+ return a[idx]
+
+
+def take_last(a: torch.Tensor, dim: int, num: int) -> torch.Tensor:
+ """
+ Take last elements from a given tensor along a given dimension.
+
+ Parameters
+ ----------
+ a
+ Original tensor to slice.
+ dim
+ Dimension to slice over.
+ num
+ Number of trailing elements to retain (non-negative).
+
+ Returns
+ -------
+ torch.Tensor
+ A tensor with the same size as the input one, except dimension
+ ``dim`` which has length equal to ``num``.
+ """
+ assert num >= 0
+ return slice_along_dim(a, dim, slice(a.shape[dim] - num, None))
+
+
+def unsqueeze_expand(a: torch.Tensor, dim: int, size: int) -> torch.Tensor:
+ """
+ Unsqueeze a dimension and expand over it in one go.
+
+ Parameters
+ ----------
+ a
+ Original tensor to unsqueeze.
+ dim
+ Dimension to unsqueeze.
+ size
+ Size for the new dimension.
+
+ Returns
+ -------
+ torch.Tensor
+ A tensor with an added dimension ``dim`` of size ``size``.
+ """
+ a = a.unsqueeze(dim)
+ sizes = list(a.shape)
+ sizes[dim] = size
+ return a.expand(*sizes)
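+
+# Illustrative sketch: with lags [0, 2] every output step t carries the values at
+# t and t-2, taken from the concatenation of prior_sequence and sequence.
+if __name__ == "__main__":
+    prior = torch.arange(1.0, 5.0).unsqueeze(0)   # [[1., 2., 3., 4.]]
+    seq = torch.arange(5.0, 8.0).unsqueeze(0)     # [[5., 6., 7.]]
+    lags = lagged_sequence_values([0, 2], prior, seq, dim=-1)
+    print(lags.shape)   # torch.Size([1, 3, 2])
+    print(lags[0])      # [[5., 3.], [6., 4.], [7., 5.]]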
diff --git a/megatron/laggpt/inference.py b/megatron/laggpt/inference.py
new file mode 100644
index 00000000..2ed1ce69
--- /dev/null
+++ b/megatron/laggpt/inference.py
@@ -0,0 +1,178 @@
+from megatron.neox_arguments import NeoXArgs
+from megatron.initialize import initialize_megatron
+from megatron.trainingTIMES import setup_model_and_optimizer
+from megatron.laggpt.dataloaders import combined_test_dataset_iterator
+from megatron import print_rank_0
+import megatron.mpu as mpu
+import torch
+import os, zarr, shutil
+
+
+
+
+class GenerationIterator:
+ """
+    At inference time, the same data iterators are used as in the training and validation stages.
+    Initially, future_target is replaced with a single non-observed point (non-observed values do not affect the loss).
+    On each iteration, the newly generated values are inserted before the non-observed point.
+ """
+ def __init__(self, iterator, times_envelope):
+ self.iterator = iterator
+ self.times_envelope = times_envelope
+ self.src = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+ self.keys = None
+ self.ground_truth, self.past_target, self.future = None, None, None
+
+ @torch.no_grad()
+ def set_new_vals(self, new_vals):
+ self.future = torch.hstack([self.future, new_vals]) if self.future is not None else new_vals
+
+ def get_batch_size(self):
+ return self.batch["past_target"].size(0)
+
+ def get_batch_device(self):
+ return self.batch["past_target"].device
+
+ def start_new_seq(self):
+ self.new_seq = True
+
+ self.iterator.update_buffer()
+ self.batch = next(self.iterator)
+ self.ground_truth = self.batch["future_target"]
+ self.past_target = self.batch["past_target"]
+
+ self.loc, self.scale = self.times_envelope.get_loc_scale(self.batch)
+ self.future = None
+
+ @torch.no_grad()
+ def __next__(self):
+
+ batch = self.batch
+
+ past_target = batch["past_target"]
+ batch_dim = past_target.size(0)
+ future_values = torch.zeros((batch_dim, 1), dtype = past_target.dtype, device = past_target.device)
+ future_observed_values = torch.zeros((batch_dim, 1), dtype = past_target.dtype, device = past_target.device)
+
+ if self.future is not None:
+ future_values = torch.hstack([self.future, future_values])
+ ones = torch.ones((batch_dim, self.future.size(-1)), dtype = torch.float32, device = past_target.device)
+ future_observed_values = torch.hstack([ones, future_observed_values])
+ batch["future_target"] = future_values
+ batch["future_observed_values"] = future_observed_values
+
+ return batch
+
+
+def initialize():
+ """
+    An attempt to rewrite GPT-NeoX options to tailor them to the inference stage.
+    TODO: the gradient_accumulation_steps override does not work and needs to be changed manually in the config file.
+ """
+ _overwrite_values = {
+ "gradient_accumulation_steps": 1,
+ "checkpoint_activations": False,
+ "partition_activations": False,
+ "no_load_optim": True,
+ "optimizer": None, # prevent loading optimizer (no_load_optim alone won't work)
+ "zero_optimization": None, # disable zero optimization (won't be used in inference, and loading zero optimizer can cause errors)
+ }
+
+ neox_args = NeoXArgs.consume_neox_args(overwrite_values=_overwrite_values)
+ neox_args.configure_distributed_args()
+
+
+
+ # initialize megatron
+ initialize_megatron(neox_args)
+ model, times_envelope , _, _ = setup_model_and_optimizer(neox_args, use_cache = False)
+ data_iterator = combined_test_dataset_iterator(neox_args)
+ return neox_args, model, times_envelope, data_iterator
+
+
+def process_batch(model, times_envelope, gen_iterator, prediction_length):
+
+ data_nodes = (model.is_first_stage() or model.is_last_stage())
+ pipe_rank_shift = (mpu.get_pipe_parallel_world_size() - 1) * mpu.get_model_parallel_world_size() * mpu.get_data_parallel_world_size()
+
+ if data_nodes:
+ gen_iterator.start_new_seq()
+
+ #model.module.clear_cache()
+ for i in range(prediction_length):
+
+ # clear tensor metadata
+ model.first_output_send = True
+ model.pipe_recv_buf = None
+
+ print_rank_0("> Token: ", i)
+ _, outputs = model.eval_batch(gen_iterator, return_logits = True)
+
+ if model.is_last_stage():
+            last_column = [out[:, -1:, ...] for out in outputs]
+ loc, scale = gen_iterator.loc, gen_iterator.scale
+ new_vals = times_envelope.get_greedy_val(last_column, loc, scale)
+ torch.distributed.send(new_vals, torch.distributed.get_rank() - pipe_rank_shift)
+ if model.is_first_stage():
+ new_vals = torch.empty((gen_iterator.get_batch_size(), 1), dtype = torch.float32, device = gen_iterator.get_batch_device())
+ torch.distributed.recv(new_vals, torch.distributed.get_rank() + pipe_rank_shift)
+
+ if data_nodes:
+ gen_iterator.set_new_vals(new_vals)
+
+ ground_truth = to_numpy(gen_iterator.ground_truth)
+ past_target = to_numpy(gen_iterator.past_target)
+ future = to_numpy(gen_iterator.future)
+
+ return ground_truth, past_target, future
+
+
+def inference(neox_args, model, times_envelope, data_iterator):
+ """
+ Main inference routine.
+    TODO: needs to be changed to handle larger outputs; caching is not tested and is turned off for now.
+ """
+
+ times_args = neox_args.times_args
+ inference_opt = times_args["inference"]
+ chunk_size = inference_opt["chunk_size"]
+ prediction_length = times_args["prediction_length"]
+ gen_iterator = GenerationIterator(data_iterator, times_envelope)
+ output_filename = inference_opt["file_name"]
+
+ if torch.distributed.get_rank() == 0:
+ if os.path.exists(output_filename) and os.path.isdir(output_filename):
+ shutil.rmtree(output_filename)
+
+ torch.distributed.barrier()
+
+ write_nodes = (mpu.get_pipe_parallel_rank() == 0 and mpu.get_model_parallel_rank() == 0)
+
+ if write_nodes:
+ root = zarr.open(output_filename)
+ group = root.create_group(f"data{mpu.get_data_parallel_rank():04d}.json")
+
+ def create_array(name, array):
+ return group.array(name, array, chunks = (chunk_size, array.shape[-1]))
+
+ for i in range(inference_opt["num_test_batches"]):
+ print_rank_0("> Batch index: ", i)
+ ground_truth, past_target, future = process_batch(model, times_envelope, gen_iterator, prediction_length)
+ if write_nodes:
+ if i == 0:
+ ground_truth_array = create_array("ground_truth", ground_truth)
+ past_target_array = create_array("past_target", past_target)
+ future_array = create_array("future", future)
+ else:
+ ground_truth_array.append(ground_truth)
+ past_target_array.append(past_target)
+ future_array.append(future)
+
+
+
+
+def to_numpy(tensor):
+ if tensor is None:
+ return None
+ return tensor.cpu().numpy()
+
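+
+# Minimal driver sketch (an assumption about how this module is meant to be invoked
+# from a NeoX-style launcher; the real entry point may differ):
+#
+#     neox_args, model, times_envelope, data_iterator = initialize()
+#     inference(neox_args, model, times_envelope, data_iterator)
+#
+# Each writer rank (pipe rank 0, model rank 0) ends up with one zarr group named
+# data{data_parallel_rank:04d}.json holding the "ground_truth", "past_target" and
+# "future" arrays for its test batches.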
diff --git a/megatron/laggpt/layers.py b/megatron/laggpt/layers.py
new file mode 100644
index 00000000..b6144813
--- /dev/null
+++ b/megatron/laggpt/layers.py
@@ -0,0 +1,46 @@
+import torch.nn.init as init
+from megatron.mpu.layers import ColumnParallelLinear
+
+
+
+
+class ProjectionPipe(ColumnParallelLinear):
+ """
+ Modification of ColumnParallelLinear to pass attention mask through the projection layer.
+ """
+ def __init__(
+ self,
+ neox_args,
+ input_size,
+ output_size,
+ bias=True,
+ gather_output=True,
+ init_method=init.xavier_normal_,
+ stride=1,
+ keep_master_weight_for_test=False,
+ skip_bias_add=False,
+ mup_rescale_parameters=False,
+ ):
+ super(ProjectionPipe, self).__init__(
+ neox_args,
+ input_size,
+ output_size,
+ bias,
+ gather_output,
+ init_method,
+ stride,
+ keep_master_weight_for_test,
+ skip_bias_add,
+ mup_rescale_parameters,
+ )
+
+ def forward(self, input_):
+
+ input_, attn_mask = input_
+        # The second return value is the bias (returned when skip_bias_add is set); it is not needed here.
+ output, _ = super(ProjectionPipe, self).forward(input_)
+ return output, attn_mask
+
+
+
+
\ No newline at end of file
diff --git a/megatron/laggpt/transformer_envelope.py b/megatron/laggpt/transformer_envelope.py
new file mode 100644
index 00000000..45ae893b
--- /dev/null
+++ b/megatron/laggpt/transformer_envelope.py
@@ -0,0 +1,195 @@
+import torch
+from torch import nn
+from gluonts.itertools import prod
+from gluonts.time_feature import get_lags_for_frequency
+
+
+
+from .gluontstorch.utils import take_last, repeat_along_dim, lagged_sequence_values, unsqueeze_expand
+from .gluontstorch.scaler import StdScaler, MeanScaler, NOPScaler
+from .gluontstorch.studentt import StudentTOutput
+from .gluontstorch.loss import DistributionLoss, NegativeLogLikelihood
+
+
+from megatron.utils import get_attn_mask
+from megatron import print_rank_0
+
+
+class TransformerEnvelope:
+ """
+    A class that defines the input scaling, batch function, loss, and greedy-value calculation for lag-GPT.
+    The code is taken from the original lag-GPT model.
+ """
+ def __init__(
+ self, context_length,
+ scaling,
+ hidden_size,
+ distribution_head=StudentTOutput(),
+ loss: DistributionLoss = NegativeLogLikelihood()):
+
+
+ self.embedding_dim = hidden_size
+ self.probabilistic_loss = loss
+
+ self.context_length = context_length
+
+ if scaling == "mean":
+ print_rank_0("> mean scaling ...")
+ self.scaler = MeanScaler(keepdim=True, dim=1)
+ elif scaling == "std":
+ self.scaler = StdScaler(keepdim=True, dim=1)
+ print_rank_0("> std scaling ...")
+ else:
+ self.scaler = NOPScaler(keepdim=True, dim=1)
+
+ self.lags_seq = sorted(
+ list(
+ set(
+ get_lags_for_frequency(freq_str="Q", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="M", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="W", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="D", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="H", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="T", num_default_lags=1)
+ + get_lags_for_frequency(freq_str="S", num_default_lags=1)
+ )
+ )
+ )
+ self.dist_head = distribution_head
+ n_scaling_factors = 2
+        self._feature_size = len(self.lags_seq) + n_scaling_factors
+
+
+ @property
+ def distribution_projection(self):
+ projection = self.dist_head.get_args_proj(self.embedding_dim).cuda()
+ for param in projection.parameters():
+ torch.distributed.broadcast(param, src=0)
+ return projection
+
+ @property
+ def feature_size(self):
+ return self._feature_size
+
+ @property
+ def modules(self):
+ return self._modules
+
+ @property
+ def past_length(self) -> int:
+ return self.context_length + max(self.lags_seq)
+
+
+ def get_loc_scale(self, batch):
+ past_target = batch["past_target"]
+ past_observed_values = batch["past_observed_values"]
+ _, loc, scale = self.scaler(past_target, past_observed_values)
+ return loc, scale
+
+
+ def batch_fn(self, batch):
+
+ with torch.no_grad():
+ past_target = batch["past_target"]
+ past_observed_values = batch["past_observed_values"]
+ future_target = batch["future_target"]
+ future_observed_values = batch["future_observed_values"]
+
+ # TODO, not supported
+ extra_dims = len(future_target.shape) - len(past_target.shape)
+ extra_shape = future_target.shape[:extra_dims]
+
+ repeats = prod(extra_shape)
+ past_target = repeat_along_dim(past_target, 0, repeats)
+ past_observed_values = repeat_along_dim(past_observed_values, 0, repeats)
+
+ future_target = future_target.reshape(
+ -1,
+ *future_target.shape[extra_dims + 1 :],
+ )
+ future_observed = future_observed_values.reshape(
+ -1,
+ *future_observed_values.shape[extra_dims + 1 :],
+ )
+
+ scaled_past_target, loc, scale = self.scaler(past_target, past_observed_values)
+
+ if future_target is not None:
+ future_length = future_target.shape[1]
+ input = torch.cat(
+ (
+ scaled_past_target[..., -self.context_length :],
+ (future_target[..., : future_length - 1] - loc) / scale,
+ ),
+ dim=-1,
+ )
+ else:
+ input = scaled_past_target[..., -self.context_length :]
+
+ prior_input = (past_target[..., : -self.context_length] - loc) / scale
+ lags = lagged_sequence_values(self.lags_seq, prior_input, input, dim=-1)
+
+ static_feat = torch.cat((loc.abs().log1p(), scale.log()), dim=-1)
+ expanded_static_feat = unsqueeze_expand(
+ static_feat, dim=-2, size=lags.shape[-2]
+ )
+
+ tokens = torch.cat((lags, expanded_static_feat), dim=-1)
+ seq_length = tokens.shape[1]
+ attn_mask = get_attn_mask(seq_length=seq_length, device=tokens.device)
+
+ pipe_tuple = (
+ (tokens, attn_mask),
+ (
+ past_target, future_target,
+ past_observed_values, future_observed, loc, scale
+ )
+ )
+
+ return pipe_tuple
+
+
+ #past_target, future_target, past_observed_values, future_observed, loc, scale
+ def loss(self, dist_args, pipe_tuple):
+
+ past_target, future_target, past_observed_values, future_observed, loc, scale = pipe_tuple
+ dist = self.dist_head.distribution(dist_args, loc, scale)
+
+ context_target = take_last(
+ past_target, dim=-1, num=self.context_length - 1
+ )
+
+ target = torch.cat(
+ (context_target, future_target),
+ dim=1,
+ )
+
+ context_observed = take_last(
+ past_observed_values, dim=-1, num=self.context_length - 1
+ )
+ observed_values = torch.cat((context_observed, future_observed), dim=1)
+ result_loss = (self.probabilistic_loss(dist, target) * observed_values).sum() / observed_values.sum().clamp_min(1.0)
+
+ return result_loss
+
+
+ #past_target, future_target, past_observed_values, future_observed, loc, scale
+ def get_greedy_val(self, dist_args, loc, scale):
+
+ dist = self.dist_head.distribution(dist_args, loc, scale)
+ return dist.mean
+
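+
+# Shape sketch for batch_fn (illustrative; B, P, F and the context length C are
+# placeholders, not values from a config):
+#   past_target:   (B, P)                       with P = context_length + max(lags_seq)
+#   future_target: (B, F)
+#   input:         (B, C + F - 1)               scaled context plus the shifted future
+#   lags:          (B, C + F - 1, len(lags_seq))
+#   tokens:        (B, C + F - 1, len(lags_seq) + 2)
+# where the two extra static features are log1p(|loc|) and log(scale).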
\ No newline at end of file
diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py
index 9af46de9..6a3ef592 100755
--- a/megatron/model/__init__.py
+++ b/megatron/model/__init__.py
@@ -16,5 +16,6 @@
# limitations under the License.
from .gpt2_model import GPT2ModelPipe
+from .lag_gpt_model import LagGPT
from .utils import get_params_for_weight_decay_optimization
from .word_embeddings import SoftEmbedding
diff --git a/megatron/model/lag_gpt_model.py b/megatron/model/lag_gpt_model.py
new file mode 100644
index 00000000..0b57e28e
--- /dev/null
+++ b/megatron/model/lag_gpt_model.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2021 EleutherAI
+# This file is based on code by the authors denoted below and has been modified from its original version.
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPT-2 model."""
+
+import math
+import torch
+import torch.nn as nn
+from collections import defaultdict
+
+from megatron.model.utils import Lambda, SequentialWrapper, recursive_setattr
+from megatron.model.norms import get_norm
+from megatron.model.init_functions import get_init_methods
+
+from megatron import mpu
+from megatron.mpu import ParallelRelativePositionBias
+from megatron.model.transformer import (
+ ParallelTransformerLayerPipe,
+ NormPipe,
+ ParallelLinearPipe,
+ parallel_lm_logits,
+ ParallelLinear,
+)
+from megatron.model.gmlp import GMLPBlock
+#from megatron.model.word_embeddings import EmbeddingPipe, SoftEmbedding
+from megatron.mpu import ColumnParallelLinear
+from megatron.laggpt import ProjectionPipe
+
+# Pipeline parallelism
+from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
+from typing import Union, List
+
+
+def gpt2_attention_mask_func(attention_scores, ltor_mask):
+ mask_value = torch.finfo(attention_scores.dtype).min
+ # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+ mask_value = torch.tensor(mask_value, dtype=attention_scores.dtype, device=attention_scores.device)
+ attention_scores.masked_fill_(ltor_mask, mask_value)
+ return attention_scores
+
+
+def cross_entropy(output, labels, _fp16=False):
+ """From pretrain_gpt2:forward_step()"""
+ """
+ if self.fp16_lm_cross_entropy:
+ assert output.dtype == torch.half
+ loss = mpu.vocab_parallel_cross_entropy(output, labels)
+ else:
+ loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
+ return loss
+ """
+ labels, loss_mask = labels[0], labels[1]
+ if _fp16:
+ assert output.dtype == torch.half and loss_mask.dtype == torch.half
+ losses = mpu.vocab_parallel_cross_entropy(output.contiguous(), labels)
+ else:
+ losses = mpu.vocab_parallel_cross_entropy(output.float().contiguous(), labels)
+ loss_mask = loss_mask.view(-1)
+ loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
+ return loss
+
+
+def _pre_transformer_block(args):
+    # data format change for hidden_states to avoid explicit transposes: [b s h] --> [s b h]
+ assert len(args) == 2, "Incorrect number of arguments to _pre_transformer_block"
+ fn = lambda _args: (_args[0].transpose(0, 1).contiguous(), *_args[1:])
+ return fn(args)
+
+
+def _post_transformer_block(args):
+ # from (hidden_states, attention_mask)
+ # to (hidden_states.T)
+ assert len(args) == 2, "Incorrect number of arguments to _post_transformer_block"
+ fn = lambda _args: (_args[0].transpose(0, 1).contiguous())
+ return fn(args)
+
+
+class LagGPT(PipelineModule, torch.nn.Module):
+    """Lag-GPT model adapted for pipeline parallelism.
+
+ The largest change is flattening the GPTModel class so we can express it as a
+ sequence of layers including embedding, transformer layers, and output.
+
+ :param neox_args: NeoX arguments object (configuration)
+ :param num_tokentypes: number of token types (TODO: deprecated, remove)
+ :param parallel_output: if true, don't gather the output logits, and calculate loss in parallel. Set to true by default in training for efficiency, but set to false for inference.
+ :param topology: deepspeed topology object specifying pipe / model parallelism topology.
+ :param use_cache: if true, cache key/value pairs for each layer in inference.
+ """
+
+ def __init__(
+ self,
+ neox_args,
+ envelope,
+ num_tokentypes=0,
+ parallel_output=True,
+ topology=None,
+ use_cache=False,
+ ):
+ self.neox_args = neox_args
+ self.envelope = envelope
+ self.use_cache = use_cache
+ self.parallel_output = parallel_output
+ self.hidden_size = self.neox_args.hidden_size
+ self.num_tokentypes = num_tokentypes
+ self.init_method, self.output_layer_init_method = get_init_methods(
+ self.neox_args
+ )
+ self.__topology__ = topology
+
+ self.specs = []
+ self.init_specs() # initializes the layer specs (basically a fancy nn.Sequential)
+
+ super().__init__(
+ layers=self.specs,
+ loss_fn=envelope.loss,
+ topology=topology,
+ activation_checkpoint_interval=self.neox_args.checkpoint_num_layers
+ if self.neox_args.checkpoint_activations
+ else 0,
+ partition_method=neox_args.pipe_partition_method,
+ checkpointable_layers=["GMLPBlock", "ParallelTransformerLayerPipe"],
+ )
+
+ def insert_layers(
+ self, layers: Union[nn.Module, nn.ModuleList, nn.Sequential, List], idx
+ ):
+ """
+ inserts the layers in `layers` into the pipe model at `idx`.
+ """
+ if isinstance(layers, nn.Module):
+ self.specs.insert(idx, layers)
+ elif any(
+ [isinstance(layers, nn.ModuleList), isinstance(layers, nn.Sequential)]
+ ):
+ self.specs[idx:idx] = layers
+ elif isinstance(layers, list):
+ assert all(
+ [hasattr(l, "__call__") for l in layers]
+ ), "all items in `layers` must be Callables"
+ self.specs[idx:idx] = layers
+ else:
+ raise ValueError(
+ f"layer passed into {self.__class__.__name__}.insert_layer() should be either an nn.Module, an nn.ModuleList, an nn.Sequential object, or a list of callables not a {type(layers)}"
+ )
+
+ # re-initialize parent class
+ super().__init__(
+ layers=self.specs,
+ loss_fn=self.loss_fn,
+ topology=self.__topology__,
+ activation_checkpoint_interval=self.activation_checkpoint_interval,
+ partition_method=self.neox_args.pipe_partition_method,
+ checkpointable_layers=["GMLPBlock", "ParallelTransformerLayerPipe"],
+ )
+
+ def init_specs(self):
+
+
+ self.specs = []
+
+ # Embedding layer
+ # input will be (input_ids, position_ids, attention_mask)
+
+ self.specs.append(
+ LayerSpec(
+ ProjectionPipe,
+ self.neox_args,
+ self.envelope.feature_size,
+ self.hidden_size,
+ bias = True,
+ gather_output = True,
+ )
+ )
+
+ # NB: the attention mask always needs to be the *last* item in the args when being passed from
+ # one stage to the next, because deepspeed is hacks on top of hacks.
+ #
+ # outputs are now (hidden_states, attention_mask)
+
+ self.specs.append(_pre_transformer_block)
+
+ # T5 RPE positional embedding
+ if self.neox_args.pos_emb == "rpe":
+ hidden_size_per_attention_head = mpu.divide(
+ self.neox_args.hidden_size, self.neox_args.num_attention_heads
+ )
+ rpe_scale = math.sqrt(hidden_size_per_attention_head)
+ rpe_emb = ParallelRelativePositionBias(
+ neox_args=self.neox_args,
+ scale=rpe_scale,
+ causal=True,
+ num_buckets=self.neox_args.rpe_num_buckets,
+ max_distance=self.neox_args.rpe_max_distance,
+ heads=self.neox_args.num_attention_heads,
+ )
+
+ # Transformer layers
+ for i in range(self.neox_args.num_layers):
+ layer_type = self.neox_args.attention_config[i]
+ if layer_type in ["gmlp", "amlp"]:
+ self.specs.append(
+ LayerSpec(
+ GMLPBlock,
+ init_method=self.init_method,
+ layer_number=i,
+ output_layer_init_method=self.output_layer_init_method,
+ neox_args=self.neox_args,
+ mask_fn=gpt2_attention_mask_func,
+ )
+ )
+ else:
+ self.specs.append(
+ LayerSpec(
+ ParallelTransformerLayerPipe,
+ neox_args=self.neox_args,
+ attention_mask_func=gpt2_attention_mask_func,
+ init_method=self.init_method,
+ output_layer_init_method=self.output_layer_init_method,
+ layer_number=i,
+ rpe=rpe_emb if self.neox_args.pos_emb == "rpe" else None,
+ rotary=self.neox_args.pos_emb == "rotary",
+ use_cache=self.use_cache,
+ )
+ )
+
+ # used to drop attention mask + reshape hidden states
+ self.specs.append(_post_transformer_block)
+
+ # NormPipe is a (deprecated) helper class that used to be used to pass presents along the pipeline - since presents are now cached to the `TransformerLayer` class this is no longer needed
+ norm, eps = get_norm(self.neox_args)
+ self.specs.append(
+ LayerSpec(NormPipe, norm, self.neox_args.hidden_size, eps=eps)
+ )
+
+ self.specs.append(self.envelope.distribution_projection)
+
+
+ def _set_parallel_output(self, value):
+ # sets the parallel output value of the final layer to value
+ final_layer = list(self.forward_funcs)[-1]
+ if isinstance(final_layer, (ParallelLinearPipe, ParallelLinear)):
+ final_layer.final_linear.set_parallel_output(value)
+
+ def inference_mode(self, use_cache=True):
+ """
+ Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
+ so logits are gathered across model parallel ranks.
+
+ :param cache: (bool) True if you want to use caching during inference, False otherwise
+ """
+ # first set caching to true if specified
+ recursive_setattr(self.forward_funcs, "use_cache", use_cache, assert_type=bool)
+ # then set parallel output of the final layer to false so we don't have to gather the output manually
+ self._set_parallel_output(False)
+ recursive_setattr(self.forward_funcs, "training", False)
+
+ def train_mode(self):
+ """
+ Sets up the model for training by turning off k/v caching and setting `parallel output` of the final layer to True,
+ so logits are not gathered across model parallel ranks, and loss is computed in parallel (more efficient).
+ """
+ # set caching to false
+ recursive_setattr(self.forward_funcs, "use_cache", False)
+ # then set parallel output to true (more efficient training)
+ self._set_parallel_output(True)
+ recursive_setattr(self.forward_funcs, "training", True)
+
+ def clear_cache(self):
+ """
+ Recursively clears the kv cache on all layers
+ """
+ recursive_setattr(self.forward_funcs, "layer_past", None)
+
+ def to_sequential(self):
+ """
+ Transforms the PipelineModule to a plain nn.Sequential module
+ :return:
+ """
+ layers = []
+ tied_layers = defaultdict(list)
+ for n, spec in enumerate(self.specs):
+ if isinstance(spec, TiedLayerSpec):
+ if spec.key in tied_layers:
+ # receiver
+ layers.append(
+ Lambda(lambda x: spec.forward_fn(tied_layers[spec.key][0], x))
+ )
+ else:
+ # owner
+ module = spec.build(log=False)
+ layers.append(module)
+ tied_layers[spec.key].append(module)
+ elif isinstance(spec, LayerSpec):
+ layers.append(spec.build(log=False))
+ elif hasattr(spec, "__call__"):
+ # check that it's a callable function
+ layers.append(Lambda(spec))
+ else:
+ raise ValueError(f"Layer number {n} ({spec}) Not recognized")
+ model = SequentialWrapper(
+ layers,
+ self.activation_checkpoint_interval,
+ self.activation_checkpoint_func,
+ parent_class_name=self.__class__.__name__,
+ )
+ return model
diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py
index 488baf04..86a98286 100644
--- a/megatron/model/word_embeddings.py
+++ b/megatron/model/word_embeddings.py
@@ -77,6 +77,7 @@ def __init__(
self.embedding_module = torch.nn.Embedding
# Position embedding (serial).
+
self.use_pos_emb = use_pos_emb
if self.use_pos_emb:
self.embedding_type = neox_args.pos_emb
diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py
index 611d2adb..cf6bb6d6 100644
--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
@@ -16,7 +16,7 @@
from .cross_entropy import vocab_parallel_cross_entropy
-from .data import broadcast_data
+from .data import broadcast_data, broadcast_data_first_last_layer_group
from .initialize import is_unitialized
from .initialize import destroy_model_parallel
@@ -25,7 +25,7 @@
from .initialize import get_data_parallel_world_size
from .initialize import get_model_parallel_group
from .initialize import get_model_parallel_rank, set_model_parallel_rank
-from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank
+from .initialize import get_model_parallel_src_rank, get_data_parallel_src_rank, get_model_layer_group_and_src
from .initialize import get_model_parallel_world_size, set_model_parallel_world_size
from .initialize import get_topology
from .initialize import get_pipe_parallel_group
diff --git a/megatron/mpu/data.py b/megatron/mpu/data.py
index 7b937053..70c79714 100644
--- a/megatron/mpu/data.py
+++ b/megatron/mpu/data.py
@@ -17,6 +17,8 @@
from .initialize import get_model_parallel_group
from .initialize import get_model_parallel_rank
from .initialize import get_model_parallel_src_rank
+from .initialize import get_model_layer_group_and_src
+
_MAX_DATA_DIM = 4
@@ -118,3 +120,112 @@ def broadcast_data(keys, data, datatype):
offset += numel
return output
+
+
+
+def _build_key_size_numel_dictionaries_first_last_layer_group(keys, data, src):
+ """Build the size on rank 0 and broadcast."""
+ max_dim = _MAX_DATA_DIM
+ sizes = [0 for _ in range(max_dim) for _ in keys]
+
+ layer_group, layer_src = get_model_layer_group_and_src()
+ # Pack the sizes on rank zero.
+ if src:
+ offset = 0
+ for key in keys:
+ assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM"
+ size = data[key].size()
+ for i, s in enumerate(size):
+ sizes[i + offset] = s
+ offset += max_dim
+
+ # Move to GPU and broadcast.
+ sizes_cuda = torch.cuda.LongTensor(sizes)
+ torch.distributed.broadcast(
+ sizes_cuda, layer_src, group=layer_group
+ )
+
+ # Move back to cpu and unpack.
+ sizes_cpu = sizes_cuda.cpu()
+ key_size = {}
+ key_numel = {}
+ total_numel = 0
+ offset = 0
+ for key in keys:
+ i = 0
+ size = []
+ numel = 1 if sizes_cpu[offset] > 0 else 0
+ while sizes_cpu[offset + i] > 0:
+ this_size = sizes_cpu[offset + i]
+ size.append(this_size)
+ numel *= this_size
+ i += 1
+ key_size[key] = size
+ key_numel[key] = numel
+ total_numel += numel
+ offset += max_dim
+ return key_size, key_numel, total_numel
+
+
+def broadcast_keys_first_last_layer_group(keys, src = False):
+
+ layer_group, layer_src = get_model_layer_group_and_src()
+ size = len(keys) if keys else 0
+ size = torch.as_tensor(size).cuda()
+
+ torch.distributed.broadcast(
+ size, layer_src, group=layer_group
+ )
+ keys = keys if keys else [None] * size.cpu()
+ torch.distributed.broadcast_object_list(keys, layer_src, group = layer_group)
+ return keys
+
+
+
+def broadcast_data_first_last_layer_group(keys, data, datatype, src = False):
+ """Broadcast data from rank zero of each model parallel group to the
+ members of the same model parallel group.
+
+ Arguments:
+ keys: list of keys in the data dictionary to be broadcasted
+ data: data dictionary of string keys and cpu tensor values.
+ datatype: torch data type of all tensors in data associated
+ with keys.
+ """
+ # Build (key, size) and (key, number of elements) dictionaries along
+ # with the total number of elements on all ranks.
+
+ key_size, key_numel, total_numel = _build_key_size_numel_dictionaries_first_last_layer_group(keys, data, src)
+
+ layer_group, layer_src = get_model_layer_group_and_src()
+
+ # Pack on rank zero.
+ if src:
+ # Check that all keys have the same data type.
+ _check_data_types(keys, data, datatype)
+ # Flatten the data associated with the keys
+ flatten_data = torch.cat(
+ [data[key].contiguous().view(-1) for key in keys], dim=0
+ ).cuda()
+ else:
+ flatten_data = torch.empty(
+ total_numel, device=torch.cuda.current_device(), dtype=datatype
+ )
+
+ # Broadcast
+ torch.distributed.broadcast(
+ flatten_data, layer_src, group=layer_group
+ )
+
+ # Unpack
+ output = {}
+ offset = 0
+
+ for key in keys:
+ size = key_size[key]
+ numel = key_numel[key]
+ output[key] = flatten_data.narrow(0, offset, numel).view(size)
+ offset += numel
+
+ return output
+
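+
+# Usage sketch (an assumption about the intended call pattern; every rank in the
+# first/last-layer group must participate, and only the source rank passes data):
+#
+#     keys = broadcast_keys_first_last_layer_group(keys if src else None, src=src)
+#     batch = broadcast_data_first_last_layer_group(
+#         keys, data if src else None, torch.float32, src=src)
+#
+# Non-source ranks receive tensors rebuilt to the shapes packed by the source rank.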
diff --git a/megatron/mpu/initialize.py b/megatron/mpu/initialize.py
index 325e46ba..df9e5407 100644
--- a/megatron/mpu/initialize.py
+++ b/megatron/mpu/initialize.py
@@ -29,6 +29,10 @@
# Pipeline parallel group that the current rank belongs to.
_PIPE_PARALLEL_GROUP = None
+# Group with all first and last layers ranks
+_LAYER_GROUP = None
+_LAYER_SRC = None
+
# A group used to sync during the IO process. Usually this is data_parallel_group(),
# but with pipeline parallelism it must also involve the last stage (which is not in the
# DP group of rank 0)
@@ -69,6 +73,8 @@ def initialize_model_parallel(model_parallel_size, topology=None, fp32_allreduce
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
+
+
if torch.distributed.get_rank() == 0:
print("> initializing model parallel with size {}".format(model_parallel_size))
# Get world size and rank. Ensure some consistencies.
@@ -110,6 +116,7 @@ def initialize_model_parallel(model_parallel_size, topology=None, fp32_allreduce
if rank in pp_group:
_PIPE_PARALLEL_GROUP = group
+
# Build IO group
global _IO_PARALLEL_GROUP
if topology and topology.get_dim("pipe") > 1:
@@ -125,6 +132,31 @@ def initialize_model_parallel(model_parallel_size, topology=None, fp32_allreduce
else:
_IO_PARALLEL_GROUP = get_data_parallel_group()
+
+ global _FP32_ALLREDUCE
+ assert _FP32_ALLREDUCE is None, "fp32_allreduce is already initialized"
+ _FP32_ALLREDUCE = fp32_allreduce
+
+ assert topology, "only supported initialization with topology"
+
+ global _LAYER_GROUP
+ global _LAYER_SRC
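+    # For every data-parallel replica, build one process group that spans the
+    # ranks of the first pipeline stage plus (when pipe > 1) the last stage,
+    # with (pipe=0, model=0) as its source rank. It is exposed via
+    # get_model_layer_group_and_src() and used by the
+    # broadcast_*_first_last_layer_group helpers in mpu/data.py.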
+ n_pipe = topology.get_dim("pipe")
+ for i in range(topology.get_dim("data")):
+ src = topology.get_rank(pipe = 0, model = 0, data = i)
+ group = topology.filter_match(pipe = 0, data = i)
+ if n_pipe > 1:
+ group += topology.filter_match(pipe = n_pipe - 1, data = i)
+
+ if rank == 0:
+ print(f"MPU LAYERS: {group} for data axis: {i}")
+ dist_group = torch.distributed.new_group(ranks=group)
+ if rank in group:
+ _LAYER_GROUP = dist_group
+ _LAYER_SRC = src
+
+
+
# Build the model parallel groups.
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, "model parallel group is already initialized"
@@ -154,9 +186,12 @@ def initialize_model_parallel(model_parallel_size, topology=None, fp32_allreduce
if i == (rank // model_parallel_size):
_MODEL_PARALLEL_GROUP = group
- global _FP32_ALLREDUCE
- assert _FP32_ALLREDUCE is None, "fp32_allreduce is already initialized"
- _FP32_ALLREDUCE = fp32_allreduce
+
+
+
+
+def get_model_layer_group_and_src():
+ return _LAYER_GROUP, _LAYER_SRC
def model_parallel_is_initialized():
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 7a43371e..f03ef8e1 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -45,6 +45,7 @@
NeoXArgsTextgen,
NeoXArgsOptimizer,
NeoXArgsLRScheduler,
+ NeoXArgsTimesModel,
ATTENTION_TYPE_CHOICES,
)
@@ -95,6 +96,7 @@
NeoXArgsLogging,
NeoXArgsTextgen,
NeoXArgsOther,
+ NeoXArgsTimesModel
]
DEEPSPEED_ARG_CLASSES = [NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig]
@@ -423,12 +425,16 @@ def consume_neox_args(cls, overwrite_values=None):
megatron_config = json.loads(
base64.urlsafe_b64decode(args_parsed.megatron_config).decode("utf-8")
)
- if args_parsed.deepspeed_config is not None:
+
+
+ if not overwrite_values and args_parsed.deepspeed_config is not None:
overwrite_values = cls.set_up_autotuning(
args_parsed.deepspeed_config, overwrite_values
)
+
if overwrite_values is not None:
megatron_config.update(overwrite_values)
+
return cls.from_dict(args_dict=megatron_config)
@staticmethod
@@ -546,6 +552,7 @@ def get_deepspeed_main_args(self):
# get deepspeed_config
args_list.append("--deepspeed_config")
+
if self.autotuning_run is not None:
ds_fp = cwd / Path("ds_config.json")
if self.rank == 0:
@@ -558,6 +565,8 @@ def get_deepspeed_main_args(self):
).decode("utf-8")
args_list.append(encoded_ds_config)
+
+
# get all config values
args_list.append("--megatron_config")
neox_args = self.get_parent_class_value_dict(
diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index e1ea16a1..1298f357 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -84,6 +84,11 @@ class NeoXArgsParallelism(NeoXArgsTemplate):
"""
+@dataclass
+class NeoXArgsTimesModel(NeoXArgsTemplate):
+ times_args: dict = None
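+    """
+    Dictionary of time-series model settings. Keys referenced elsewhere in this
+    change include "context_length", "scaling" and "datasets" (with "train" /
+    "validation" / "test" lists of GluonTS dataset names); "past_length" is
+    filled in at runtime from the TransformerEnvelope.
+    """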
+
+
@dataclass
class NeoXArgsModel(NeoXArgsTemplate):
"""
diff --git a/megatron/training.py b/megatron/trainingGPT.py
similarity index 100%
rename from megatron/training.py
rename to megatron/trainingGPT.py
diff --git a/megatron/trainingTIMES.py b/megatron/trainingTIMES.py
new file mode 100644
index 00000000..348b36ac
--- /dev/null
+++ b/megatron/trainingTIMES.py
@@ -0,0 +1,762 @@
+# Copyright (c) 2021, EleutherAI
+# This file is based on code by the authors denoted below and has been modified from its original version.
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file has been modified from its original version
+#
+
+"""Pretrain utilities."""
+from datetime import datetime
+from functools import partial
+
+import math
+import sys
+
+import torch
+import deepspeed
+from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler
+import numpy as np
+
+from megatron.utils import (
+ Timers,
+ init_wandb,
+ get_ltor_masks_and_position_ids,
+ reduce_losses,
+)
+
+from megatron import print_rank_0, mpu
+from megatron.model import (
+ LagGPT,
+ SoftEmbedding,
+ get_params_for_weight_decay_optimization,
+)
+from megatron.laggpt import TransformerEnvelope, combined_dataset_iterator
+from megatron.checkpointing import load_checkpoint, save_checkpoint
+from megatron.data.data_utils import build_train_valid_test_data_iterators
+from megatron.initialize import initialize_megatron
+from megatron.learning_rates import AnnealingLR
+from megatron.logging import tb_wandb_log, training_log
+from megatron.utils import (
+ OverflowMonitor,
+ get_noise_scale_logger,
+ get_total_params,
+ CharCounter,
+)
+from megatron.model.gpt2_model import cross_entropy
+from eval_tasks import run_eval_harness
+
+
+
+def pretrain(neox_args):
+ """Main training program.
+
+ This function will run the following in the order provided:
+ 1) initialize Megatron.
+ 2) setup model, optimizer and lr schedule
+    3) build the train/valid/test data iterators via combined_dataset_iterator.
+ 4) train the model.
+
+ Arguments:
+ neox_args: an instance of NeoXArgs containing the configuration for pretrain
+
+ """
+ # setup logging and timers
+ init_wandb(neox_args=neox_args)
+ timers = Timers(
+ use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer
+ )
+ print_rank_0(neox_args.times_args)
+
+ # Initialize and get arguments, timers, and Tensorboard writer.
+ initialize_megatron(neox_args=neox_args)
+
+ # Model, optimizer, and learning rate.
+ timers("model and optimizer").start()
+ model, _, optimizer, lr_scheduler = setup_model_and_optimizer(
+ neox_args=neox_args, use_cache=False, iteration=neox_args.iteration
+ )
+ timers("model and optimizer").stop()
+
+ # Data stuff.
+ timers("train/valid/test data iterators").start()
+
+ (
+ train_data_iterator,
+ valid_data_iterator,
+ test_data_iterator,
+ ) = combined_dataset_iterator(neox_args=neox_args)
+ timers("train/valid/test data iterators").stop()
+
+ if neox_args.use_mup and neox_args.coord_check:
+ raise ValueError("MUP is not supported by Times-NeoX.")
+
+ # Print setup timing.
+ print_rank_0("done with setups ...")
+ timers.log(["model and optimizer", "train/valid/test data iterators"])
+ print_rank_0("training ...")
+
+ iteration = neox_args.iteration
+ if neox_args.do_train and neox_args.train_iters > 0:
+ # edge case: save step 0 checkpoint if requested and we're starting from step 0
+ if neox_args.save and 0 in neox_args.save_iters and iteration == 0:
+ save_checkpoint(
+ neox_args=neox_args,
+ iteration=iteration,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ )
+
+ iteration = train(
+ neox_args=neox_args,
+ timers=timers,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ train_data_iterator=train_data_iterator,
+ valid_data_iterator=valid_data_iterator,
+ )
+
+ if neox_args.do_valid:
+ prefix = "the end of training for val data"
+ evaluate_and_print_results(
+ neox_args=neox_args,
+ prefix=prefix,
+ forward_step_func=forward_step,
+ data_iterator=valid_data_iterator,
+ model=model,
+ iteration=iteration,
+ verbose=False,
+ timers=timers,
+ )
+
+ if neox_args.save and iteration != 0:
+ save_checkpoint(
+ neox_args=neox_args,
+ iteration=iteration,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ )
+
+ if neox_args.do_test:
+ # Run on test data.
+ prefix = "the end of training for test data"
+ evaluate_and_print_results(
+ neox_args=neox_args,
+ prefix=prefix,
+ forward_step_func=forward_step,
+ data_iterator=test_data_iterator,
+ model=model,
+ iteration=iteration,
+ verbose=True,
+ timers=timers,
+ chart_name="test",
+ )
+
+
+
+def forward_step(
+ data_iterator, model, neox_args, timers, return_logits=False, is_train=False
+):
+ """Forward step."""
+ if neox_args.is_pipe_parallel:
+ return model.eval_batch(data_iterator, return_logits=return_logits)
+
+ raise ValueError("Only pipe parallel is supported by Times-NeoX")
+
+
+def get_model(neox_args, times_envelope, use_cache=False):
+ """Build the model."""
+
+ # Build model on cpu.
+ print_rank_0("building LagGPT model ...")
+
+ # Temporarily disable mup so that the base model does not use the mup init functions before set_base_shapes is called below.
+ # If mup isn't being used anyways, this has no effect.
+ old_use_mup = neox_args.use_mup
+ neox_args.use_mup = False
+
+ model = LagGPT(
+ neox_args=neox_args,
+ envelope=times_envelope,
+ num_tokentypes=0,
+ parallel_output=True,
+ topology=mpu.get_topology(),
+ use_cache=use_cache,
+ )
+
+ if not neox_args.is_pipe_parallel:
+ # Export PipeParallel model to nn.Sequential model to avoid the overhead of deepspeed's pipe parallel training
+ model = model.to_sequential()
+
+ neox_args.use_mup = old_use_mup
+
+ if neox_args.use_mup:
+ raise ValueError("MUP is not supported for times-neox")
+
+ if neox_args.deepspeed:
+ # DeepSpeed handles CUDA, FP16, and DDP components.
+ return model
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+
+
+def get_optimizer(model, neox_args):
+ """Set up the optimizer."""
+ if neox_args.no_load_optim:
+ return None, None
+
+ if neox_args.optimizer is None:
+ print_rank_0(
+ f"ERROR: Optimizer is None. Either set the optimizer dict in your config (if training) or set no_load_optim in your config (if inference)"
+ )
+ exit()
+ # Build parameter groups (weight decay and non-decay).
+ param_groups = get_params_for_weight_decay_optimization(model, neox_args)
+ print_rank_0(
+ f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}'
+ )
+
+ # Add model parallel attribute if it is not set.
+ for param_group in param_groups:
+ for param in param_group["params"]:
+ if not hasattr(param, "model_parallel"):
+ param.model_parallel = False
+
+ # Filter out params that don't require a grad (for soft prompt tuning, etc.)
+ _param_groups = []
+ for param_group in param_groups:
+ trainable_params = [p for p in param_group["params"] if p.requires_grad]
+ param_group["params"] = trainable_params
+ _param_groups.append(param_group)
+ param_groups = _param_groups
+
+ # If we're using mup, then the optimizer must be adam or sgd
+ assert not neox_args.use_mup or (
+ neox_args.optimizer_type.lower() == "adam"
+ or neox_args.optimizer_type.lower() == "sgd"
+ ), f"If use_mup == True, you must specify either the adam or sgd optimizers. You passed: {neox_args.optimizer_type.lower()}"
+
+ if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]:
+        if neox_args.optimizer_type.lower() == "cpu_torch_adam":
+ cpu_adam_optimizer = torch.optim.Adam
+ else:
+ from deepspeed.ops.adam import DeepSpeedCPUAdam
+
+ cpu_adam_optimizer = DeepSpeedCPUAdam
+ optimizer = cpu_adam_optimizer(
+ param_groups,
+ weight_decay=neox_args.weight_decay,
+ **neox_args.optimizer["params"],
+ )
+ elif neox_args.optimizer_type.lower() == "onebitadam":
+ assert neox_args.deepspeed
+ optimizer = None
+ # onebitadam needs to be instantiated within the deepspeed engine to work :|
+ elif neox_args.optimizer_type.lower() == "sm3":
+ from .optimizers import SM3
+
+ optimizer = SM3(param_groups, **neox_args.optimizer["params"])
+ elif neox_args.optimizer_type.lower() == "madgrad_wd":
+ from .optimizers import madgrad_wd
+
+ optimizer = madgrad_wd(
+ param_groups,
+ weight_decay=neox_args.weight_decay,
+ **neox_args.optimizer["params"],
+ )
+ elif neox_args.optimizer_type.lower() == "adam":
+ # Use Adam
+ if neox_args.use_mup:
+ try:
+ from mup import MuAdam
+
+ adam_optimizer = MuAdam
+ except ModuleNotFoundError:
+ print("Please install mup https://github.com/microsoft/mup")
+ raise Exception
+ else:
+ if neox_args.use_bnb_optimizer:
+ try:
+ import bitsandbytes as bnb
+
+ adam_optimizer = bnb.optim.Adam8bit
+ except ModuleNotFoundError:
+ print(
+ "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes."
+ )
+ raise Exception
+ else:
+ try:
+ # default to apex as it's slightly faster
+ from apex.optimizers import FusedAdam as Adam
+ except ImportError:
+ # if apex isn't installed, use deepspeed's FusedAdam
+ print(
+ "WARNING: APEX not installed - defaulting to deepspeed's fused adam"
+ )
+ from deepspeed.ops.adam import FusedAdam as Adam
+ adam_optimizer = Adam
+ optimizer = adam_optimizer(
+ param_groups,
+ weight_decay=neox_args.weight_decay,
+ **neox_args.optimizer["params"],
+ )
+ elif neox_args.optimizer_type.lower() == "sgd":
+ try:
+ from mup import MuSGD
+ except ModuleNotFoundError:
+ print("Please install mup https://github.com/microsoft/mup")
+ raise Exception
+ optimizer = MuSGD(
+ param_groups,
+ weight_decay=neox_args.weight_decay,
+ **neox_args.optimizer["params"],
+ )
+ else:
+ raise ValueError(f"Optimizer type {neox_args.optimizer_type} not recognized")
+
+ if neox_args.deepspeed:
+ # fp16 wrapper is not required for DeepSpeed.
+ return optimizer, param_groups
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+
+
+def get_learning_rate_scheduler(optimizer, neox_args):
+ """Build the learning rate scheduler."""
+ if neox_args.no_load_optim:
+ # TODO: this should be configured as a separate arg
+ return None
+ if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam":
+ print_rank_0(
+ "WARNING: onebitadam requires the lr scheduler be built by deepspeed - "
+ "Make sure one is added to your deepspeed config"
+ )
+ return None
+
+ # Add linear learning rate scheduler.
+ if neox_args.lr_decay_iters is not None:
+ num_iters = neox_args.lr_decay_iters
+ else:
+ num_iters = neox_args.train_iters
+ num_iters = max(1, num_iters)
+ init_step = 0
+ warmup_iter = neox_args.warmup * num_iters
+ lr_scheduler = AnnealingLR(
+ optimizer,
+ start_lr=neox_args.lr,
+ warmup_iter=warmup_iter,
+ total_iters=num_iters,
+ decay_style=neox_args.lr_decay_style,
+ last_iter=init_step,
+ min_lr=neox_args.min_lr,
+ use_checkpoint_lr_scheduler=neox_args.use_checkpoint_lr_scheduler,
+ override_lr_scheduler=neox_args.override_lr_scheduler,
+ use_mup=neox_args.use_mup,
+ )
+
+ return lr_scheduler
+
+
+def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
+ """Setup model and optimizer."""
+
+ # Create time-series envelope
+
+ times_args = neox_args.times_args
+ times_envelope = TransformerEnvelope(
+ times_args["context_length"],
+ times_args["scaling"],
+ neox_args.hidden_size
+ )
+ times_args["past_length"] = times_envelope.past_length
+
+ model = get_model(neox_args=neox_args, times_envelope=times_envelope, use_cache=use_cache)
+ optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
+ lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args)
+
+ if neox_args.deepspeed:
+ print_rank_0("DeepSpeed is enabled.")
+ if neox_args.no_load_optim:
+ assert optimizer is None
+ _model_params = None
+ _lr_scheduler = None
+ else:
+ _model_params = param_groups if optimizer is None else None
+ _lr_scheduler = lr_scheduler
+
+ model, optimizer, _, lr_scheduler = deepspeed.initialize(
+ model=model,
+ optimizer=optimizer,
+ args=neox_args,
+ lr_scheduler=_lr_scheduler,
+ dist_init_required=False,
+ model_parameters=_model_params,
+ # Need to remove the below so that it doesn't conflict with --deepspeed_config required by autotuning
+ # config_params=neox_args.deepspeed_config,
+ mpu=mpu if not neox_args.is_pipe_parallel else None,
+ )
+ #model.set_has_attention_mask(True)
+ model.total_params = get_total_params(model.module)
+ print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')
+
+ if neox_args.is_pipe_parallel:
+ #model.set_has_attention_mask(True)
+ if neox_args.curriculum_learning:
+ curr_scheduler = CurriculumScheduler(neox_args.curriculum_learning)
+ if iteration is not None and iteration > 0:
+ curr_scheduler.update_difficulty(iteration)
+ else:
+ curr_scheduler = None
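+        # DeepSpeed's pipeline engine applies this function to every batch pulled
+        # from the data iterator, so the envelope controls how raw time-series
+        # batches are turned into model inputs and labels.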
+ model.set_batch_fn(times_envelope.batch_fn)
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+
+ if neox_args.load is not None:
+ neox_args.iteration = load_checkpoint(
+ neox_args=neox_args,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ iteration=iteration,
+ )
+ print_rank_0(
+ f"Loading checkpoint and starting from iteration {neox_args.iteration}"
+ )
+ else:
+ neox_args.iteration = 0
+
+ return model, times_envelope, optimizer, lr_scheduler
+
+
+def backward_step(neox_args, timers, optimizer, model, loss):
+ """Backward step."""
+
+ # Backward pass.
+ timers("backward-backward").start()
+ if neox_args.deepspeed:
+ model.backward(loss)
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+ timers("backward-backward").stop()
+
+ if neox_args.deepspeed:
+ # DeepSpeed backward propagation already addressed all reduce communication.
+ # Reset the timer to avoid breaking timer logs below.
+ timers("backward-allreduce").reset()
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+
+
+def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler):
+ """Single training step."""
+
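+    # For laggpt-style iterators, pre-distribute the next window of data across
+    # GPUs before the pipeline step (same pattern as in evaluate() below).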
+ if data_iterator:
+ data_iterator.update_buffer()
+
+ # Pipeline parallelism schedules forward/backward/step
+ if neox_args.is_pipe_parallel:
+ reduced_loss = train_step_pipe(
+ neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator
+ )
+ else:
+ losses = []
+ for _ in range(neox_args.gradient_accumulation_steps):
+ # Forward model for one step.
+ timers("forward").start()
+ loss = forward_step(
+ neox_args=neox_args,
+ timers=timers,
+ data_iterator=data_iterator,
+ model=model,
+ is_train=True,
+ )
+ timers("forward").stop()
+ losses.append(loss)
+ # Calculate gradients, reduce across processes, and clip.
+ timers("backward").start()
+ backward_step(
+ neox_args=neox_args,
+ timers=timers,
+ optimizer=optimizer,
+ model=model,
+ loss=loss,
+ )
+ timers("backward").stop()
+ # Update parameters.
+ timers("optimizer").start()
+ if neox_args.deepspeed:
+ model.step()
+ else:
+ raise ValueError("Must be using deepspeed to run neox")
+ timers("optimizer").stop()
+ reduced_loss = {
+ "lm_loss": reduce_losses(losses).mean()
+ } # reduces losses across machines for logging
+
+ if neox_args.precision == "fp16" and model.optimizer.overflow:
+ skipped_iter = 1
+ else:
+ skipped_iter = 0
+
+ return reduced_loss, skipped_iter
+
+
+def train_step_pipe(neox_args, timers, model, data_iterator):
+ """Single training step with DeepSpeed's pipeline parallel engine."""
+
+ assert neox_args.deepspeed
+ loss = model.train_batch(data_iter=data_iterator)
+ loss_dict = {"lm_loss": loss}
+ # Don't break Megatron's timers because we changed code paths.
+ for t in [
+ "forward",
+ "backward",
+ "allreduce",
+ "optimizer",
+ "batch generator",
+ "data loader",
+ ]:
+ timers(t).reset()
+ return loss_dict
+
+
+def train(
+ neox_args,
+ timers,
+ model,
+ optimizer,
+ lr_scheduler,
+ train_data_iterator,
+ valid_data_iterator,
+):
+ """Train the model function."""
+
+ #import warnings
+ #warnings.filterwarnings("error")
+
+ # Turn on training mode which enables dropout.
+ model.train()
+
+ # Tracking loss.
+ total_loss_dict = {}
+
+ # Iterations.
+ iteration = neox_args.iteration
+
+ timers("interval time").start()
+ report_memory_flag = True
+
+ # get noise scale logger (if neox_args.log_gradient_noise_scale is True)
+ noise_scale_logger = get_noise_scale_logger(neox_args)
+
+ # to monitor if we've skipped many iterations in a row and trigger an early exit
+ overflow_monitor = OverflowMonitor(optimizer)
+ while iteration < neox_args.train_iters:
+ loss_dict, skipped_iter = train_step(
+ neox_args=neox_args,
+ timers=timers,
+ data_iterator=train_data_iterator,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ )
+ iteration += 1
+ neox_args.iteration = iteration
+ if neox_args.precision == "fp16":
+ overflow_monitor.check(skipped_iter) # check for repeated overflow
+ if neox_args.log_gradient_noise_scale: # log noise scale if applicable
+ noise_scale_logger.update()
+
+ # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you
+ # may have no tunable parameters on a specific rank
+ if optimizer.param_groups:
+ lr = optimizer.param_groups[0].get("lr", 0)
+ else:
+ lr = 0
+
+ # Logging.
+ report_memory_flag = training_log(
+ neox_args=neox_args,
+ timers=timers,
+ loss_dict=loss_dict,
+ total_loss_dict=total_loss_dict,
+ learning_rate=lr,
+ iteration=iteration,
+ loss_scale=optimizer.cur_scale if neox_args.precision == "fp16" else None,
+ report_memory_flag=report_memory_flag,
+ skipped_iter=skipped_iter,
+ model=model,
+ optimizer=optimizer,
+ noise_scale_logger=noise_scale_logger,
+ )
+
+ # Checkpointing
+ if neox_args.save and iteration in neox_args.save_iters:
+ save_checkpoint(
+ neox_args=neox_args,
+ iteration=iteration,
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ )
+
+ # Evaluation
+ if (
+ neox_args.eval_interval
+ and iteration % neox_args.eval_interval == 0
+ and neox_args.do_valid
+ ):
+ prefix = "iteration {}".format(iteration)
+ evaluate_and_print_results(
+ neox_args=neox_args,
+ prefix=prefix,
+ forward_step_func=forward_step,
+ data_iterator=valid_data_iterator,
+ model=model,
+ iteration=iteration,
+ verbose=False,
+ timers=timers,
+ )
+
+ if neox_args.exit_interval and iteration % neox_args.exit_interval == 0:
+ torch.distributed.barrier()
+ time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ rank = torch.distributed.get_rank()
+ print_rank_0(
+ "rank: {} | time: {} | exiting the program at iteration {}".format(
+ rank, time_str, iteration
+ )
+ )
+ sys.exit()
+
+ return iteration
+
+
+def evaluate(
+ neox_args, forward_step_fn, data_iterator, model, verbose=False, timers=None
+):
+ """Evaluation.
+ neox_args: NeoX Arguments
+    forward_step_fn: function with args (neox_args, timers, data_iterator, model)
+        that will run a forward pass on the model
+ data_iterator: Iterator that iterates over batches of data. Should return data in the form:
+ {'text': np.array([tokens], dtype=np.int64)}
+ where the size of the array is the model's context size + 1
+ (`get_batch` transforms it into inputs / labels)
+ """
+ # Turn on evaluation mode which disables dropout.
+ model.eval()
+
+ losses = []
+ with torch.no_grad():
+ iteration = 0
+ while iteration < neox_args.eval_iters:
+ # Distribute data between GPUs for laggpt
+ if data_iterator:
+ data_iterator.update_buffer()
+
+ iteration += 1
+ if verbose and iteration % neox_args.log_interval == 0:
+ print_rank_0(
+ "Evaluating iter {}/{}".format(iteration, neox_args.eval_iters)
+ )
+
+ # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s
+ # to be consistent with deepspeed's pipe parallel engine
+ # since pipe parallel already takes gas into account - default to 1 here if pipe parallel is true
+ for _ in range(
+ 1
+ if neox_args.is_pipe_parallel
+ else neox_args.gradient_accumulation_steps
+ ):
+ # Forward evaluation
+ loss = forward_step_fn(
+ model=model,
+ data_iterator=data_iterator,
+ neox_args=neox_args,
+ timers=timers,
+ )
+ losses.append(loss)
+
+ # When contiguous memory optimizations are enabled, the buffers
+ # allocated by the optimizations are deallocated during backward pass
+ # in the absence of backward pass the buffers should be reset after each
+ # forward pass
+ if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
+ deepspeed.checkpointing.reset()
+
+ # reduces losses across processes for logging & run eval harness tasks
+ eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
+ eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])
+
+ # Move model back to the train mode.
+ model.train()
+ return eval_results
+
+
+def evaluate_and_print_results(
+ neox_args,
+ prefix,
+ forward_step_func,
+ data_iterator,
+ model,
+ iteration,
+ verbose=False,
+ timers=None,
+ chart_name="validation",
+):
+    """Helper function to evaluate and dump results on screen."""
+    if data_iterator:
+        data_iterator.update_buffer()
+
+ total_loss_dict = evaluate(
+ neox_args=neox_args,
+ forward_step_fn=forward_step_func,
+ data_iterator=data_iterator,
+ model=model,
+ verbose=verbose,
+ timers=timers,
+ )
+ string = f" {chart_name} results at {prefix} | "
+ for k, v in total_loss_dict.items():
+ if isinstance(v, dict):
+ for k2, v2 in v.items():
+ k3 = "_".join([k, k2])
+ string += f"{k3} value: {v2:.6E} | "
+ tb_wandb_log(
+ f"{chart_name}/{k3}",
+ v2,
+ iteration,
+ use_wandb=neox_args.use_wandb,
+ tensorboard_writer=neox_args.tensorboard_writer,
+ )
+ else:
+ string += f"{k} value: {v:.6E} | "
+ tb_wandb_log(
+ f"{chart_name}/{k}",
+ v,
+ iteration,
+ use_wandb=neox_args.use_wandb,
+ tensorboard_writer=neox_args.tensorboard_writer,
+ )
+
+ length = len(string) + 1
+ print_rank_0("-" * length)
+ print_rank_0(string)
+ print_rank_0("-" * length)
diff --git a/megatron/utils.py b/megatron/utils.py
index 0071ef87..24b3a23e 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -416,7 +416,7 @@ def setup_for_inference_or_eval(
from megatron.neox_arguments import NeoXArgs
from megatron.initialize import initialize_megatron
- from megatron.training import setup_model_and_optimizer
+ from megatron.trainingGPT import setup_model_and_optimizer
_overwrite_values = {
"checkpoint_activations": False,
diff --git a/preload_gluonts_datasets.py b/preload_gluonts_datasets.py
new file mode 100755
index 00000000..ad64bed8
--- /dev/null
+++ b/preload_gluonts_datasets.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
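+#
+# Pre-download every GluonTS dataset listed under times_args["datasets"] in a
+# NeoX config file, so training runs do not stall on dataset downloads.
+# Example invocation (config path is illustrative):
+#   ./preload_gluonts_datasets.py 49M.yml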
+
+from gluonts.dataset.repository.datasets import get_dataset
+import argparse, json, os, re
+
+parser = argparse.ArgumentParser("Preload GluonTS datasets")
+parser.add_argument("filename", help="Config file path", type=str)
+args = parser.parse_args()
+filename = args.filename
+
+if not os.path.exists(filename):
+ raise FileNotFoundError(f"File {filename} does not exist.")
+
+with open(filename, "r") as file:
+ content = file.read()
+    obj = json.loads(re.sub(r"^\s*#.*", "", content, flags=re.MULTILINE))
+
+
+datasets = obj["times_args"]["datasets"]
+
+to_load = []
+if "train" in datasets:
+ to_load.extend(datasets["train"])
+if "validation" in datasets:
+ to_load.extend(datasets["validation"])
+if "test" in datasets:
+ to_load.extend(datasets["test"])
+
+for i in to_load:
+ get_dataset(i)
\ No newline at end of file
diff --git a/tests/topology/test_topology.py b/tests/topology/test_topology.py
new file mode 100644
index 00000000..2b9b9434
--- /dev/null
+++ b/tests/topology/test_topology.py
@@ -0,0 +1,156 @@
+import torch
+from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
+
+# The broadcast helpers below additionally assume the megatron.mpu group getters
+# and the private _MAX_DATA_DIM / _check_data_types helpers are importable here.
+
+n_pipe = 10
+n_model = 5
+n_data = 2
+size = n_pipe * n_model * n_data
+
+topo = PipeModelDataParallelTopology(num_pp=n_pipe, num_mp=n_model, num_dp=n_data)
+
+
+def get_model_parallel_src_rank(rank, model_size):
+ """Calculate the global rank corresponding to a local rank zero
+ in the model parallel group."""
+
+ global_rank = rank
+ local_world_size = model_size
+ return (global_rank // local_world_size) * local_world_size
+
+
+groups = []
+srcs = []
+n_pipe = topo.get_dim("pipe")
+for i in range(topo.get_dim("data")):
+ first = topo.filter_match(pipe = 0, data = i)
+ last = topo.filter_match(pipe = n_pipe - 1, data = i)
+ src = topo.get_rank(pipe = 0, model = 0, data = i)
+ group = first + last
+ groups.append(group)
+ srcs.append(src)
+
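+# With num_pp=10, num_mp=5, num_dp=2, each group should combine the 5 model-parallel
+# ranks of the first pipeline stage with the 5 of the last stage for a single
+# data-parallel replica (10 ranks per group), and each src should be the
+# (pipe=0, model=0) rank of that replica.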
+print(groups)
+print(srcs)
+print(get_model_parallel_src_rank(rank = 7, model_size=n_model))
+
+
+
+
+def _prepare_size_buffer(keys, data, src):
+
+ max_dim = _MAX_DATA_DIM
+ sizes = [0 for _ in range(max_dim) for _ in keys]
+
+ # Pack the sizes on rank zero.
+ if src:
+ offset = 0
+ for key in keys:
+ assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM"
+ size = data[key].size()
+ for i, s in enumerate(size):
+ sizes[i + offset] = s
+ offset += max_dim
+
+ # Move to GPU and broadcast.
+ sizes_cuda = torch.cuda.LongTensor(sizes)
+ return sizes_cuda
+
+
+def unpack_size(keys, sizes_cuda):
+
+ max_dim = _MAX_DATA_DIM
+ sizes_cpu = sizes_cuda.cpu()
+ key_size = {}
+ key_numel = {}
+ total_numel = 0
+ offset = 0
+ for key in keys:
+ i = 0
+ size = []
+ numel = 1
+ while sizes_cpu[offset + i] > 0:
+ this_size = sizes_cpu[offset + i]
+ size.append(this_size)
+ numel *= this_size
+ i += 1
+ key_size[key] = size
+ key_numel[key] = numel
+ total_numel += numel
+ offset += max_dim
+
+ return key_size, key_numel, total_numel
+
+
+def _prepare_data_buffer(keys, data, datatype, total_numel, src):
+ if src:
+ # Check that all keys have the same data type.
+ _check_data_types(keys, data, datatype)
+ # Flatten the data associated with the keys
+ flatten_data = torch.cat(
+ [data[key].contiguous().view(-1) for key in keys], dim=0
+ ).cuda()
+ else:
+ flatten_data = torch.empty(
+ total_numel, device=torch.cuda.current_device(), dtype=datatype
+ )
+
+ return flatten_data
+
+
+def _unpack_data_buffer(flatten_data, keys, key_size, key_numel):
+ # Unpack
+ output = {}
+ offset = 0
+ for key in keys:
+ size = key_size[key]
+ numel = key_numel[key]
+ output[key] = flatten_data.narrow(0, offset, numel).view(size)
+ offset += numel
+
+ return output
+
+
+def _cross_broadcast(buffer, model_parallel_world_size, src):
+ if src:
+ torch.distributed.isend(
+ buffer, get_pipe_parallel_last_rank(),
+ group = get_pipe_parallel_group())
+ else:
+ torch.distributed.recv(
+ buffer, get_pipe_parallel_src_rank(),
+ group = get_pipe_parallel_group())
+
+ if model_parallel_world_size > 1:
+ torch.distributed.broadcast(
+ buffer, get_model_parallel_src_rank(), group=get_model_parallel_group()
+ )
+
+
+def broadcast_data_ext(keys, data, datatype):
+
+ pipe_parallel_world_size = get_pipe_parallel_world_size()
+ model_parallel_world_size = get_model_parallel_world_size()
+
+ if model_parallel_world_size < 2 and pipe_parallel_world_size < 2:
+ for i in data.keys():
+ data[i] = data[i].cuda()
+ return data
+
+ if pipe_parallel_world_size < 2:
+ return broadcast_data(keys, data, datatype)
+
+ pipe_parallel_rank = get_pipe_parallel_rank()
+ model_parallel_rank = get_model_parallel_rank()
+
+
+
+ src = (model_parallel_rank == 0 and pipe_parallel_rank == 0)
+ sizes_cuda = _prepare_size_buffer(keys, data, src)
+
+ _cross_broadcast(sizes_cuda, model_parallel_world_size, src)
+
+ key_size, key_numel, total_numel = unpack_size(keys, sizes_cuda)
+ flatten_data = _prepare_data_buffer(keys, data, datatype, total_numel, src)
+
+ _cross_broadcast(flatten_data, model_parallel_world_size, src)
+
+ return _unpack_data_buffer(flatten_data, keys, key_size, key_numel)
diff --git a/tools/print_zarr.py b/tools/print_zarr.py
new file mode 100755
index 00000000..2e87c351
--- /dev/null
+++ b/tools/print_zarr.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
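+# Render each group of a zarr prediction store (its "ground_truth", "future" and
+# "past_target" arrays) as a two-panel PDF page. Example invocation, with
+# illustrative file names:
+#   ./tools/print_zarr.py --input predictions.zarr --pdf predictions.pdf --context_length 100
+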
+import matplotlib.pyplot as plt
+import numpy as np
+import argparse, os
+from matplotlib.backends.backend_pdf import PdfPages
+import zarr
+
+
+def make_figs(group, pdf):
+ ground_truth = group["ground_truth"][:]
+ prediction = group["future"][:]
+ past = group["past_target"][:, -context_length:]
+ ground_truth_series = np.hstack([past, ground_truth])
+ prediction_series = np.hstack([past, prediction])
+
+ for gt, pr, gt_series, pr_series in zip(ground_truth, prediction, ground_truth_series, prediction_series):
+ fig, (ax1, ax2) = plt.subplots(nrows = 2, ncols = 1)
+
+ ax2.plot(gt)
+ ax2.plot(pr)
+ ax1.plot(gt_series)
+ ax1.plot(pr_series)
+        ax1.axvline(context_length - 1, color="r")
+ fig.savefig(pdf, format='pdf')
+
+ plt.close(fig)
+
+
+parser = argparse.ArgumentParser(
+ prog='Time series visualization',
+ description='Create PDF with predicted time series')
+
+parser.add_argument(
+ "-c", "--context_length", type = int, default = 100,
+ help = "length of the context to visualize")
+parser.add_argument(
+ "-i", "--input", type = str, required=True,
+ help = "Name of input zarr file")
+parser.add_argument(
+ "-p", "--pdf", type = str, required=True,
+ help = "Name of output PDF file.")
+args = parser.parse_args()
+
+
+context_length = args.context_length
+input_filename = args.input
+output_filename = args.pdf
+
+if not os.path.exists(input_filename):
+    raise FileNotFoundError(f"File {input_filename} does not exist; use the --input option to set the input file name.")
+
+file = zarr.open(input_filename, "r")
+pdf = PdfPages(output_filename)
+
+for i in file.group_keys():
+ make_figs(file[i], pdf)
+
+pdf.close()
diff --git a/train-times.py b/train-times.py
new file mode 100644
index 00000000..b5225635
--- /dev/null
+++ b/train-times.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, EleutherAI
+# This file is based on code by the authors denoted below and has been modified from its original version.
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Train"""
+from megatron.neox_arguments import NeoXArgs
+from megatron.trainingTIMES import pretrain
+#from megatron.trainingGPT import pretrain
+
+if __name__ == "__main__":
+ neox_args = NeoXArgs.consume_neox_args()
+ neox_args.configure_distributed_args()
+ if not neox_args.times_args:
+        neox_args.build_tokenizer() # tokenizer needs to be built in training in order to set the padding vocab
+ neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined
+ pretrain(neox_args=neox_args)
diff --git a/train.py b/trainGPT.py
similarity index 84%
rename from train.py
rename to trainGPT.py
index 358ab3a8..12b23cc3 100644
--- a/train.py
+++ b/trainGPT.py
@@ -17,11 +17,12 @@
"""Train"""
from megatron.neox_arguments import NeoXArgs
-from megatron.training import pretrain
+from megatron.trainingGPT import pretrain
if __name__ == "__main__":
neox_args = NeoXArgs.consume_neox_args()
neox_args.configure_distributed_args()
- neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab
+ if not neox_args.times_args:
+        neox_args.build_tokenizer() # tokenizer needs to be built in training in order to set the padding vocab
neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined
pretrain(neox_args=neox_args)
diff --git a/train_script.sh b/train_script.sh
new file mode 100755
index 00000000..7e9e29eb
--- /dev/null
+++ b/train_script.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Runs the "49M" parameter model
+
+# asynio flags
+export LDFLAGS="$LDFLAGS -L/usr/lib64/"
+export CFLAGS="$CFLAGS -I/usr/include/"
+# c++ libs
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/vgurev/.conda/envs/GPT/x86_64-conda-linux-gnu/lib/
+export PATH=/data/vgurev/.conda/envs/GPT/bin/:$PATH
+
+# use mpirun, not the pytorch launcher
+export MPI=TRUE
+
+GPUS_PER_NODE=2
+NNODES=1
+export WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+python ./deepy.py train-times.py 49M.yml
+