diff --git a/configs/_base_/schedules/schedule_1x.py b/configs/_base_/schedules/schedule_1x.py
index dbe49db56e6..dac201eb770 100644
--- a/configs/_base_/schedules/schedule_1x.py
+++ b/configs/_base_/schedules/schedule_1x.py
@@ -17,4 +17,6 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/_base_/schedules/schedule_20e.py b/configs/_base_/schedules/schedule_20e.py
index 04163c21030..704fc326ece 100644
--- a/configs/_base_/schedules/schedule_20e.py
+++ b/configs/_base_/schedules/schedule_20e.py
@@ -17,4 +17,6 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/_base_/schedules/schedule_2x.py b/configs/_base_/schedules/schedule_2x.py
index e07b3a31e63..7c2a22a793e 100644
--- a/configs/_base_/schedules/schedule_2x.py
+++ b/configs/_base_/schedules/schedule_2x.py
@@ -17,4 +17,6 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/atss/atss_r50_fpn_1x_coco.py b/configs/atss/atss_r50_fpn_1x_coco.py
index 9f5abea1903..abe05f7daa4 100644
--- a/configs/atss/atss_r50_fpn_1x_coco.py
+++ b/configs/atss/atss_r50_fpn_1x_coco.py
@@ -67,4 +67,5 @@
         nms=dict(type='nms', iou_threshold=0.6),
         max_per_img=100))
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
index b3b7bef6c36..a8831aa4a12 100644
--- a/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
+++ b/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
@@ -22,7 +22,7 @@
 train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
 
 # training schedule for 2x
-train_cfg = dict(by_epoch=True, max_epochs=24)
+train_cfg = dict(max_epochs=24)
 
 # learning rate
 param_scheduler = [
diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
index 12015b3bf33..bb61beb5bd3 100644
--- a/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
+++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
@@ -74,8 +74,9 @@
 ]
 
 # optimizer
-optimizer = dict(
-    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),  # paramwise_cfg must stay outside this dict
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
 
 default_hooks = dict(
     optimizer=dict(
diff --git a/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
index ebf7fd32fa8..ff7491e905d 100644
--- a/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
+++ b/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
@@ -36,7 +36,7 @@
 train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
 
 # training schedule for 2x
-train_cfg = dict(by_epoch=True, max_epochs=24)
+train_cfg = dict(max_epochs=24)
 
 # learning rate
 param_scheduler = [
diff --git a/configs/fsaf/fsaf_r50_fpn_1x_coco.py b/configs/fsaf/fsaf_r50_fpn_1x_coco.py
index 5edc60b98fa..4ff65128253 100644
--- a/configs/fsaf/fsaf_r50_fpn_1x_coco.py
+++ b/configs/fsaf/fsaf_r50_fpn_1x_coco.py
@@ -43,7 +43,10 @@
         allowed_border=-1,
         pos_weight=-1,
         debug=False))
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
 default_hooks = dict(
     optimizer=dict(
         _delete_=True,
diff --git a/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
index 2d74a7c73ef..91279412247 100644
--- a/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
+++ b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -70,5 +70,6 @@
 train_dataloader = dict(batch_size=4, num_workers=2)
 
 # optimizer
-optimizer = dict(
-    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),  # paramwise_cfg must stay outside this dict
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
index 7fa63746656..8e4b3caf376 100644
--- a/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
+++ b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -69,5 +69,6 @@
 train_dataloader = dict(batch_size=4, num_workers=2)
 
 # optimizer
-optimizer = dict(
-    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),  # paramwise_cfg must stay outside this dict
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/configs/paa/paa_r50_fpn_1x_coco.py b/configs/paa/paa_r50_fpn_1x_coco.py
index 4c9c4aa73e1..d70cd4bb48d 100644
--- a/configs/paa/paa_r50_fpn_1x_coco.py
+++ b/configs/paa/paa_r50_fpn_1x_coco.py
@@ -67,4 +67,6 @@
         nms=dict(type='nms', iou_threshold=0.6),
         max_per_img=100))
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
index 6bbcac4fa4f..683b1b76638 100644
--- a/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
+++ b/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
@@ -3,4 +3,5 @@
 ]
 # optimizer
 model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py b/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
index b73772ce053..d2e88d68e33 100644
--- a/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
+++ b/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
@@ -15,8 +15,8 @@
     neck=dict(in_channels=[64, 128, 256, 512]))
 
 # Note: If the learning rate is set to 0.0025, the mAP will be 32.4.
-optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
-
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))
 # TODO: support auto scaling lr
 # NOTE: `auto_scale_lr` is for automatically scaling LR,
 # USER SHOULD NOT CHANGE ITS VALUES.
diff --git a/configs/retinanet/retinanet_r18_fpn_1x_coco.py b/configs/retinanet/retinanet_r18_fpn_1x_coco.py
index a05557e843f..96021180675 100644
--- a/configs/retinanet/retinanet_r18_fpn_1x_coco.py
+++ b/configs/retinanet/retinanet_r18_fpn_1x_coco.py
@@ -10,7 +10,8 @@
         depth=18,
         init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
     neck=dict(in_channels=[64, 128, 256, 512]))
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
 
 # TODO: support auto scaling lr
 # NOTE: `auto_scale_lr` is for automatically scaling LR,
diff --git a/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/configs/retinanet/retinanet_r50_fpn_1x_coco.py
index c64bdf87769..b09679ccc2e 100644
--- a/configs/retinanet/retinanet_r50_fpn_1x_coco.py
+++ b/configs/retinanet/retinanet_r50_fpn_1x_coco.py
@@ -5,4 +5,5 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/retinanet/retinanet_r50_fpn_2x_coco.py b/configs/retinanet/retinanet_r50_fpn_2x_coco.py
index 43dcede71d5..47511b78ed2 100644
--- a/configs/retinanet/retinanet_r50_fpn_2x_coco.py
+++ b/configs/retinanet/retinanet_r50_fpn_2x_coco.py
@@ -21,4 +21,5 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
index 02a2c291631..bc498830ff2 100644
--- a/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
+++ b/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
@@ -2,4 +2,5 @@
     '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py'
 ]
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
index f6ab512f182..9dc6459f062 100644
--- a/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
+++ b/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
@@ -5,4 +5,4 @@
 model = dict(
     pretrained='open-mmlab://resnext101_64x4d',
     backbone=dict(type='ResNeXt', depth=101, groups=64, base_width=4))
-optimizer = dict(type='SGD', lr=0.01)
+optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01))
diff --git a/configs/ssd/ssd300_coco.py b/configs/ssd/ssd300_coco.py
index 7e4ad1625e0..02079aff294 100644
--- a/configs/ssd/ssd300_coco.py
+++ b/configs/ssd/ssd300_coco.py
@@ -65,7 +65,8 @@
 test_dataloader = val_dataloader
 
 # optimizer
-optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4))
 
 custom_hooks = [
     dict(type='NumClassCheckHook'),
diff --git a/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py b/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
index d3e4c25e6ef..87841cbaa75 100644
--- a/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
+++ b/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
@@ -149,7 +149,8 @@
 ]
 
 # optimizer
-optimizer = dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5))
 
 custom_hooks = [
     dict(type='NumClassCheckHook'),
diff --git a/configs/tood/tood_r50_fpn_1x_coco.py b/configs/tood/tood_r50_fpn_1x_coco.py
index 7f23975e983..8960841fc25 100644
--- a/configs/tood/tood_r50_fpn_1x_coco.py
+++ b/configs/tood/tood_r50_fpn_1x_coco.py
@@ -76,4 +76,5 @@
         nms=dict(type='nms', iou_threshold=0.6),
         max_per_img=100))
 # optimizer
-optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))