Commit

Updating default configs to be less bad (#665)
* Update and rename small.yml to 125M.yml

* Update 13B.yml

* Update 2-7B.yml

* Update 13B.yml

* Update 6-7B.yml

* Update and rename XL.yml to 1-3B.yml

* Update 175B.yml

* Update eleutherai_cluster.yml

* Update eleutherai_cluster.yml

* Update 125M.yml

* Update 2-7B.yml

* Update 1-3B.yml

* Update 1-3B.yml

* Update 125M.yml

* Update 13B.yml

* Update 1-3B.yml

* Update 125M.yml

* Update 175B.yml

* Update 2-7B.yml

* Update 6-7B.yml

* Update medium.yml

* Rename medium.yml to 350M.yml

* Update and rename large.yml to 760M.yml

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

* Create 19M.yml

* Create 800M.yml

* Update 19M.yml

* Create 49M.yml

* Update NeoXArgs docs automatically

Co-authored-by: Stella Biderman <[email protected]>
Co-authored-by: github-actions <[email protected]>
Co-authored-by: Hailey Schoelkopf <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
4 people authored Nov 18, 2022
1 parent b1c74f3 commit fe21c3e
Showing 13 changed files with 380 additions and 48 deletions.
15 changes: 11 additions & 4 deletions configs/XL.yml → configs/1-3B.yml
@@ -14,20 +14,28 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0002,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00002,

"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
@@ -42,7 +50,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint-activations": true,
@@ -52,7 +59,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0,
   "attention-dropout": 0,

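The headline change repeated across these configs: Adam's betas move from [0.9, 0.999] to [0.9, 0.95] and weight decay from 0 to 0.1, matching the hyperparameters reported for GPT-3-scale training. A rough stand-alone PyTorch equivalent of the 1-3B.yml optimizer block (a sketch only — gpt-neox builds its optimizer through DeepSpeed, and the Linear module here is a placeholder for the real model):

```python
import torch

# Placeholder module; in gpt-neox the parameters come from the full model build.
model = torch.nn.Linear(2048, 2048)

# Mirrors 1-3B.yml: lr 0.0002, betas [0.9, 0.95], eps 1.0e-8, and the new
# "weight-decay": 0.1. AdamW applies the decoupled weight decay that a
# nonzero decay value is intended for.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2.0e-4,
    betas=(0.9, 0.95),
    eps=1.0e-8,
    weight_decay=0.1,
)
```
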
17 changes: 12 additions & 5 deletions configs/small.yml → configs/125M.yml
@@ -14,23 +14,31 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00006,

"zero_optimization": {
"stage": 0,
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
@@ -43,7 +51,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint-activations": true,
@@ -53,7 +60,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0.0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0.0,
   "attention-dropout": 0.0,

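Every config also picks up "init_method": "small_init" and "output_layer_init_method": "wang_init". As a sketch of what these names usually denote — small_init from Nguyen & Salazar's "Transformers without Tears", wang_init from Ben Wang's GPT-J codebase; the formulas below are my reading, so check megatron/model/init_functions.py for the authoritative versions:

```python
import math
import torch

def small_init_(tensor: torch.Tensor, hidden_size: int) -> torch.Tensor:
    # "Transformers without Tears": N(0, sqrt(2 / (5 * d))), which keeps
    # activation variance modest at initialization.
    std = math.sqrt(2.0 / (5.0 * hidden_size))
    return torch.nn.init.normal_(tensor, mean=0.0, std=std)

def wang_init_(tensor: torch.Tensor, hidden_size: int, num_layers: int) -> torch.Tensor:
    # GPT-J-style output-layer init: std = 2 / (L * sqrt(d)), shrinking the
    # residual-branch contribution as depth grows.
    std = 2.0 / (num_layers * math.sqrt(hidden_size))
    return torch.nn.init.normal_(tensor, mean=0.0, std=std)
```
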
17 changes: 12 additions & 5 deletions configs/13B.yml
@@ -14,17 +14,24 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0001,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
@@ -38,11 +45,11 @@
"contiguous_gradients": True,
"cpu_offload": False
},

"min_lr": 0.00001,

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
"split": "949,50,1",

# activation checkpointing
"checkpoint-activations": true,
@@ -52,7 +59,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0,
   "attention-dropout": 0,

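The new "min_lr" key (here 0.00001, a tenth of the 0.0001 peak) puts a floor under the cosine decay schedule these configs use ("lr-decay-style": "cosine"). A minimal sketch of such a schedule, assuming linear warmup followed by cosine decay to the floor — the warmup and decay step counts are illustrative, and the real scheduler lives inside gpt-neox:

```python
import math

def cosine_lr(step, max_lr=1.0e-4, min_lr=1.0e-5,
              warmup_steps=1430, decay_steps=143000):
    """Linear warmup, then cosine decay from max_lr down to min_lr."""
    if step < warmup_steps:
        return max_lr * step / warmup_steps
    progress = min(1.0, (step - warmup_steps) / (decay_steps - warmup_steps))
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```
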
14 changes: 10 additions & 4 deletions configs/175B.yml
@@ -14,20 +14,27 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00006,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.000006,
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
@@ -42,7 +49,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint-activations": true,
@@ -52,7 +58,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0,
   "attention-dropout": 0,

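One pattern worth noting: in every file this commit touches, the new min_lr is exactly one tenth of the peak learning rate. A quick check over the values visible in these diffs:

```python
import math

# (peak lr, new min_lr) pairs as they appear in this commit's diffs.
pairs = {
    "125M.yml": (6.0e-4, 6.0e-5),
    "350M.yml": (3.0e-4, 3.0e-5),
    "1-3B.yml": (2.0e-4, 2.0e-5),
    "2-7B.yml": (1.6e-4, 1.6e-5),
    "13B.yml":  (1.0e-4, 1.0e-5),
    "175B.yml": (6.0e-5, 6.0e-6),
}
for name, (lr, min_lr) in pairs.items():
    assert math.isclose(min_lr, lr / 10), name
print("min_lr == lr / 10 in every updated config")
```
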
84 changes: 84 additions & 0 deletions configs/19M.yml
@@ -0,0 +1,84 @@
{
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  # model settings
  "num-layers": 6,
  "hidden-size": 512,
  "num-attention-heads": 8,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "pos-emb": "rotary",
  "no-weight-tying": true,
  "gpt-j-residual": false,
  "output-layer-parallelism": "column",

  "scaled-upper-triang-masked-softmax-fusion": false,
  "bias-gelu-fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.001,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.0001,

  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
    "cpu_offload": False
  },

  "train_micro_batch_size_per_gpu": 4, #32,
  "gas": 1,
  "data-impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  "train-iters": 143000,
  "lr-decay-iters": 143000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "save-interval": 1000,
  "eval-interval": 100000,
  "eval-iters": 10,

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
}
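19M.yml is one of the new small configs added in this commit (together with 49M.yml and 800M.yml). The name matches a non-embedding parameter count: with 6 layers and hidden size 512, the standard 12·L·d² transformer estimate gives about 18.9M. A back-of-envelope check — the 12·L·d² approximation and the ~50k GPT-2-style vocabulary are assumptions, not values from this file:

```python
num_layers, hidden_size = 6, 512   # from 19M.yml above
vocab_size = 50304                 # assumed padded GPT-2-style vocabulary

non_embedding = 12 * num_layers * hidden_size**2   # attention + MLP weights
embedding = vocab_size * hidden_size               # token embedding matrix

print(f"non-embedding parameters: {non_embedding / 1e6:.1f}M")  # ~18.9M -> "19M"
print(f"embedding parameters:     {embedding / 1e6:.1f}M")
```
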
17 changes: 11 additions & 6 deletions configs/2-7B.yml
@@ -14,21 +14,27 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,



# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00016,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.000016,
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
@@ -43,7 +49,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint-activations": true,
@@ -53,7 +58,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0,
   "attention-dropout": 0,

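These files mix hyphenated keys ("weight-decay") with underscored ones ("train_micro_batch_size_per_gpu"); as I understand the parser, gpt-neox's NeoXArgs accepts both spellings by normalizing hyphens to underscores at load time. A toy sketch of that normalization — illustrative only, covering top-level keys alone; the real parsing lives under megatron/neox_arguments/:

```python
import yaml

def load_neox_config(path: str) -> dict:
    """Load one config file, normalizing hyphenated top-level keys."""
    with open(path) as f:
        raw = yaml.safe_load(f)
    return {key.replace("-", "_"): value for key, value in raw.items()}

# e.g. load_neox_config("configs/2-7B.yml")["weight_decay"] -> 0.1
```
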
16 changes: 10 additions & 6 deletions configs/medium.yml → configs/350M.yml
@@ -14,22 +14,27 @@
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,



# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0003,
"betas": [0.9, 0.999],
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00003,
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
@@ -43,7 +48,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint-activations": true,
@@ -53,7 +57,7 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight-decay": 0,
+  "weight-decay": 0.1,
   "hidden-dropout": 0,
   "attention-dropout": 0,
