optim_util.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils as utils
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

def bits_per_dim(x, nll):
    """Get the bits per dimension implied by using a model with negative
    log-likelihood `nll` (in nats) for compressing `x`.

    Args:
        x (torch.Tensor): Input to the model. Only used for its dimensions.
        nll (torch.Tensor): Scalar negative log-likelihood loss tensor.

    Returns:
        bpd (torch.Tensor): Bits per dimension implied if compressing `x`.
    """
    dim = np.prod(x.size()[1:])     # dimensions per example, excluding the batch dimension
    bpd = nll / (np.log(2) * dim)   # convert nats to bits, then normalize per dimension
    return bpd
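
# Usage sketch (not part of the original module): a minimal check of
# `bits_per_dim`, assuming `nll` is a scalar loss in nats. The tensors below
# are dummy values chosen only to make the expected result easy to verify.
def _example_bits_per_dim():
    x = torch.zeros(16, 3, 32, 32)            # only the shape matters: 3*32*32 = 3072 dims
    nll = torch.tensor(np.log(2) * 3072)      # a loss of ln(2) nats per dimension
    return bits_per_dim(x, nll)               # -> tensor(1.), i.e. exactly 1 bit per dim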

def clip_grad_norm(optimizer, max_norm, norm_type=2):
    """Clip the norm of the gradients for all parameters under `optimizer`.

    Args:
        optimizer (torch.optim.Optimizer): Optimizer whose parameter gradients get clipped.
        max_norm (float): The maximum allowable norm of gradients.
        norm_type (int): The type of norm to use in computing gradient norms.
    """
    for group in optimizer.param_groups:
        utils.clip_grad_norm_(group['params'], max_norm, norm_type)
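
# Usage sketch (assumption: a standard training step; the model and optimizer
# here are placeholders, not part of this module): clip gradients after
# backward() and before the optimizer step.
def _example_clip_grad_norm():
    model = nn.Linear(10, 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss = model(torch.randn(4, 10)).pow(2).mean()
    loss.backward()
    clip_grad_norm(optimizer, max_norm=100.)   # rescale gradients if their norm exceeds 100
    optimizer.step()
    optimizer.zero_grad()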

def plot_grad_flow(named_parameters):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.

    Usage: Plug this function into the Trainer class after loss.backward() as
    "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow.'''
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        if p.requires_grad and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
            max_grads.append(p.grad.abs().max())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom=-0.001, top=0.02)  # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)],
               ['max-gradient', 'mean-gradient', 'zero-gradient'])