Weight Initialization and Loss Functions in PyTorch


Weight Initialization

Vanishing and Exploding Gradients

For the two-hidden-layer network shown above, let us derive the gradient of W2.

We can see that the gradient of W2 depends heavily on H1 (the output of the previous layer): as H1 approaches 0, the gradient of W2 also approaches 0, and as H1 grows towards infinity, so does the gradient of W2.
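The derivation figure is not reproduced here; as a minimal sketch, assuming the plain fully connected chain $H_1 = X W_1$, $H_2 = H_1 W_2$, $\mathrm{out} = H_2 W_3$ (activations omitted), the chain rule gives

$$
\frac{\partial \mathrm{Loss}}{\partial W_2}
= \frac{\partial \mathrm{Loss}}{\partial \mathrm{out}} \cdot \frac{\partial \mathrm{out}}{\partial H_2} \cdot \frac{\partial H_2}{\partial W_2}
= \frac{\partial \mathrm{Loss}}{\partial \mathrm{out}} \cdot W_3 \cdot H_1
$$

so the gradient of $W_2$ carries a factor of $H_1$, the previous layer's output.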

Once gradients vanish or explode, the network can no longer be trained. To avoid either problem, we need to keep the range of each layer's outputs under control (neither too large nor too small).

Using the code below as an example, we can see that by layer 30 the network's outputs have grown so large that their standard deviation overflows and becomes nan.

import torch.nn as nn
import torch


class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)

            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break  # stop once the output std has overflowed to nan
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight.data)  # weight init: normal with mean=0, std=1


layer_nums = 100
neural_nums = 256
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()

inputs = torch.randn((batch_size, neural_nums)) # normal:mean=0, std=1

output = net(inputs)
print(output)


layer:0, std:16.192399978637695
layer:1, std:257.58282470703125
layer:2, std:4023.779296875
layer:3, std:63778.1484375
layer:4, std:1018549.25
layer:5, std:16114499.0
layer:6, std:261527760.0
layer:7, std:4343016960.0
layer:8, std:68217614336.0
layer:9, std:1059127361536.0
layer:10, std:16691895468032.0
layer:11, std:273129754591232.0
layer:12, std:4324333061144576.0
layer:13, std:6.819728172725043e+16
layer:14, std:1.0919447200341688e+18
layer:15, std:1.7315714945123353e+19
layer:16, std:2.7080612511527574e+20
layer:17, std:4.2597083154501536e+21
layer:18, std:6.901826777180292e+22
layer:19, std:1.0874986135399613e+24
layer:20, std:1.7336039095836894e+25
layer:21, std:2.721772446955186e+26
layer:22, std:4.3424048756579536e+27
layer:23, std:6.735045216945116e+28
layer:24, std:1.0334125335235665e+30
layer:25, std:1.6612993331149975e+31
layer:26, std:2.564641346528178e+32
layer:27, std:4.073855480726675e+33
layer:28, std:6.289443501272203e+34
layer:29, std:1.002706158037316e+36
layer:30, std:nan
output is nan in 30 layers

Let us use the variance formula to derive why this happens.

Let X and Y be two independent random variables, each following a distribution with mean (and hence expectation) 0 and standard deviation 1; E denotes expectation and D denotes variance.

On the relationship between mean and expectation, see: https://mp.weixin.qq.com/s?__biz=MzI2MjE3OTA1MA==&mid=2247490449&idx=1&sn=76d7ebb344c1f866308324c0e88bea2a&chksm=ea4e4a14dd39c302d6ad1a559532248466505fc01f099b71a5f7932595cba3bada08e9687077&scene=27

On the derivation of the formulas in the figure, see: https://zhuanlan.zhihu.com/p/546502658

The conclusion is: for two independent zero-mean variables X and Y, the variance (and standard deviation) of their product equals the product of their individual variances (standard deviations).

Now apply this conclusion to the computation performed by the neurons of a hidden layer.

The output variance of the first hidden layer is n times the input variance (where n is the number of input neurons), the variance of the second hidden layer is n times that of the first, and so on.

The derivation shows that the output variance of the next layer depends on three factors: (1) the number of neurons in each layer; (2) the variance of the input x; (3) the variance of the weight matrix W.

To keep each neuron's output variance at 1, we can make sure the input X has variance 1; to cancel the factor of n, the weights W must have variance 1/n.
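The formula figure is omitted, so here is a compact reconstruction under the stated assumptions (inputs and weights independent with zero mean):

$$
D(XY) = D(X)\,D(Y), \qquad
H_{1,j} = \sum_{i=1}^{n} X_i W_{ij}
\;\Rightarrow\;
D(H_{1,j}) = \sum_{i=1}^{n} D(X_i)\,D(W_{ij}) = n\,D(X)\,D(W)
$$

With $D(X) = D(W) = 1$ and $n = 256$, the output std is $\sqrt{256} = 16$, which matches the "layer:0, std:16.19" line above. Choosing $D(W) = 1/n$, i.e. $\mathrm{std}(W) = 1/\sqrt{n}$, keeps $D(H_1) = 1$.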

Let us add this to the code.

def initialize(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            # weight init with a custom std (requires `import numpy as np`)
            nn.init.normal_(
                m.weight.data,
                std=np.sqrt(1 / self.neural_num))

Now the standard deviation of each layer's output stays around 1 instead of blowing up to infinity.

Xavier Initialization

The example above did not take activation functions into account. Suppose we add an activation function in forward and observe the outputs again.

def forward(self, x):
    for (i, linear) in enumerate(self.linears):
        x = linear(x)
        x = torch.tanh(x)  # add an activation function

        print("layer:{}, std:{}".format(i, x.std()))
        if torch.isnan(x.std()):
            print("output is nan in {} layers".format(i))
            break  # stop once the output std has overflowed to nan
    return x

layer:0, std:0.6318830251693726
layer:1, std:0.4880196452140808
layer:2, std:0.40823400020599365
layer:3, std:0.35683006048202515
layer:4, std:0.32039204239845276
layer:5, std:0.29256343841552734
layer:6, std:0.2630252242088318
layer:7, std:0.24065028131008148
layer:8, std:0.22322414815425873
layer:9, std:0.21111507713794708
layer:10, std:0.20253409445285797
layer:11, std:0.19049973785877228
layer:12, std:0.18213757872581482
layer:13, std:0.17290166020393372
layer:14, std:0.16851326823234558
layer:15, std:0.16633261740207672
layer:16, std:0.16150619089603424
layer:17, std:0.1597711145877838
layer:18, std:0.15324054658412933
layer:19, std:0.14867177605628967
layer:20, std:0.14502786099910736
layer:21, std:0.1448216736316681
layer:22, std:0.14160095155239105
layer:23, std:0.13859443366527557
layer:24, std:0.13385061919689178
layer:25, std:0.1346007138490677
layer:26, std:0.13504254817962646
layer:27, std:0.1350996196269989
layer:28, std:0.1332106590270996
layer:29, std:0.12846721708774567
layer:30, std:0.12505297362804413
layer:31, std:0.12073405832052231
layer:32, std:0.11842075735330582
layer:33, std:0.11658370494842529
layer:34, std:0.11572407186031342
layer:35, std:0.11303886771202087
layer:36, std:0.11429551243782043
layer:37, std:0.11426541954278946
layer:38, std:0.11300574988126755
layer:39, std:0.11152201145887375
layer:40, std:0.10923638194799423
layer:41, std:0.10787928104400635
layer:42, std:0.10703520476818085
layer:43, std:0.10629639029502869
layer:44, std:0.10549120604991913
layer:45, std:0.10611333698034286
layer:46, std:0.10380613058805466
layer:47, std:0.1001860573887825
layer:48, std:0.10112475603818893
layer:49, std:0.10452902317047119
layer:50, std:0.09833698719739914
layer:51, std:0.09790389239788055
layer:52, std:0.09508907794952393
layer:53, std:0.09656200557947159
layer:54, std:0.0947883278131485
layer:55, std:0.09480524808168411
layer:56, std:0.09200789779424667
layer:57, std:0.09230732917785645
layer:58, std:0.08696289360523224
layer:59, std:0.09009458869695663
layer:60, std:0.08880645036697388
layer:61, std:0.08703707903623581
layer:62, std:0.08517660945653915
layer:63, std:0.08439849317073822
layer:64, std:0.08625492453575134
layer:65, std:0.08220728486776352
layer:66, std:0.08385007828474045
layer:67, std:0.08285364508628845
layer:68, std:0.08406919986009598
layer:69, std:0.08500999212265015
layer:70, std:0.08161619305610657
layer:71, std:0.0800199881196022
layer:72, std:0.08104747533798218
layer:73, std:0.0785975530743599
layer:74, std:0.07538190484046936
layer:75, std:0.07322362065315247
layer:76, std:0.07350381463766098
layer:77, std:0.07264978438615799
layer:78, std:0.07152769714593887
layer:79, std:0.07120327651500702
layer:80, std:0.06596919149160385
layer:81, std:0.0668891966342926
layer:82, std:0.06439171731472015
layer:83, std:0.06410669535398483
layer:84, std:0.061894405633211136
layer:85, std:0.06584163010120392
layer:86, std:0.06323011219501495
layer:87, std:0.06158079952001572
layer:88, std:0.06081564351916313
layer:89, std:0.059615038335323334
layer:90, std:0.05910872295498848
layer:91, std:0.05939367786049843
layer:92, std:0.060215458273887634
layer:93, std:0.05801717936992645
layer:94, std:0.05556991696357727
layer:95, std:0.054911285638809204
layer:96, std:0.05629248172044754
layer:97, std:0.0547030083835125
layer:98, std:0.054839838296175
layer:99, std:0.0540759414434433
tensor([[-0.0336, -0.0339, -0.0456, ..., 0.0345, 0.0104, -0.0351],
[ 0.0679, -0.0226, -0.0500, ..., 0.1172, 0.0275, -0.0002],
[-0.0187, -0.0416, 0.0445, ..., -0.0295, 0.0222, -0.0506],
...,
[ 0.0124, -0.0121, 0.0108, ..., 0.0376, 0.0176, 0.0237],
[-0.0248, -0.1184, -0.0842, ..., 0.0893, -0.0364, -0.0314],
[ 0.0041, 0.0016, -0.0335, ..., -0.0084, -0.0525, 0.0149]],
grad_fn=<TanhBackward>)

As the network gets deeper, its outputs become smaller and smaller, which may eventually lead to vanishing gradients. The Xavier initialization method addresses how to initialize a network when activation functions are present, so that the output variance of every layer stays around 1.

n(i) denotes the number of neurons in layer i, and n(i+1) the number of neurons in layer i+1.

The weight matrix W follows a uniform distribution over the range [-a, a], whose variance equals the squared width of the interval (upper bound minus lower bound) divided by 12, so a can ultimately be expressed in terms of the number of neurons.
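A brief reconstruction of the derivation (the original formula figure is omitted): requiring unit output variance in both the forward and the backward pass gives $n_i D(W) = 1$ and $n_{i+1} D(W) = 1$, hence

$$
D(W) = \frac{2}{n_i + n_{i+1}}, \qquad
W \sim U(-a, a) \Rightarrow D(W) = \frac{(2a)^2}{12} = \frac{a^2}{3}
\;\Rightarrow\;
a = \sqrt{\frac{6}{n_i + n_{i+1}}}
$$

which is exactly the bound computed in the code below (then scaled by the tanh gain).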

Let us implement this initialization method in code.

def initialize(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            a = np.sqrt(6 / (self.neural_num + self.neural_num))
            # the change in std caused by the activation function is called the gain
            tanh_gain = nn.init.calculate_gain('tanh')
            a *= tanh_gain
            nn.init.uniform_(m.weight.data, -a, a)


layer:0, std:0.7615973949432373
layer:1, std:0.6943124532699585
layer:2, std:0.6680195331573486
layer:3, std:0.6616125702857971
layer:4, std:0.6516215205192566
layer:5, std:0.6495921015739441
layer:6, std:0.6488803625106812
layer:7, std:0.6540157794952393
layer:8, std:0.646391749382019
layer:9, std:0.6458406448364258
layer:10, std:0.6466742753982544
layer:11, std:0.6506401896476746
layer:12, std:0.6521316170692444
layer:13, std:0.6497370600700378
layer:14, std:0.658663809299469
layer:15, std:0.6492193341255188
layer:16, std:0.6506857872009277
layer:17, std:0.6571136116981506
layer:18, std:0.6543422341346741
layer:19, std:0.6448935270309448
layer:20, std:0.6494017839431763
layer:21, std:0.6532899141311646
layer:22, std:0.6565069556236267
layer:23, std:0.6580100655555725
layer:24, std:0.6577017903327942
layer:25, std:0.6545795202255249
layer:26, std:0.6551402807235718
layer:27, std:0.6509148478507996
layer:28, std:0.6431369185447693
layer:29, std:0.6456488966941833
layer:30, std:0.6511232852935791
layer:31, std:0.6506320834159851
layer:32, std:0.657880425453186
layer:33, std:0.6485406160354614
layer:34, std:0.6527261734008789
layer:35, std:0.6500911712646484
layer:36, std:0.6485082507133484
layer:37, std:0.6504502296447754
layer:38, std:0.6503177285194397
layer:39, std:0.6530241370201111
layer:40, std:0.6510095000267029
layer:41, std:0.6553965210914612
layer:42, std:0.6578318476676941
layer:43, std:0.6548779010772705
layer:44, std:0.6529809236526489
layer:45, std:0.6459652185440063
layer:46, std:0.6443103551864624
layer:47, std:0.6450513601303101
layer:48, std:0.6509076952934265
layer:49, std:0.6491323709487915
layer:50, std:0.6418401598930359
layer:51, std:0.6513242125511169
layer:52, std:0.6482289433479309
layer:53, std:0.6528448462486267
layer:54, std:0.6462175846099854
layer:55, std:0.6517780423164368
layer:56, std:0.6513189077377319
layer:57, std:0.6553127765655518
layer:58, std:0.65123450756073
layer:59, std:0.6539309620857239
layer:60, std:0.6495435237884521
layer:61, std:0.6426352858543396
layer:62, std:0.6444136500358582
layer:63, std:0.6402761936187744
layer:64, std:0.6394422650337219
layer:65, std:0.645153820514679
layer:66, std:0.6502895951271057
layer:67, std:0.6531378030776978
layer:68, std:0.6562566161155701
layer:69, std:0.6443695425987244
layer:70, std:0.6488083600997925
layer:71, std:0.6533599495887756
layer:72, std:0.6547467708587646
layer:73, std:0.6615341901779175
layer:74, std:0.6614145040512085
layer:75, std:0.6613060235977173
layer:76, std:0.660208523273468
layer:77, std:0.6468278765678406
layer:78, std:0.6502286791801453
layer:79, std:0.6533133387565613
layer:80, std:0.6569879055023193
layer:81, std:0.6568872332572937
layer:82, std:0.6558345556259155
layer:83, std:0.6482976675033569
layer:84, std:0.650995671749115
layer:85, std:0.6492160558700562
layer:86, std:0.6520841717720032
layer:87, std:0.6460869908332825
layer:88, std:0.647861123085022
layer:89, std:0.65528404712677
layer:90, std:0.6476141214370728
layer:91, std:0.6491571664810181
layer:92, std:0.6430511474609375
layer:93, std:0.6462271809577942
layer:94, std:0.6526939272880554
layer:95, std:0.6551517248153687
layer:96, std:0.6510483026504517
layer:97, std:0.6543874144554138
layer:98, std:0.6469560265541077
layer:99, std:0.6513620018959045
tensor([[ 0.9357, -0.8044, 0.8998, ..., 0.6191, -0.1190, 0.1825],
[-0.4456, 0.0576, -0.8589, ..., 0.5687, 0.7564, 0.6264],
[ 0.8476, 0.5074, -0.8241, ..., 0.9002, 0.3679, 0.9717],
...,
[-0.6614, -0.4797, -0.5896, ..., 0.0804, -0.5856, -0.1211],
[ 0.8042, 0.8957, 0.8567, ..., -0.0121, 0.1311, -0.9198],
[ 0.1697, -0.4975, -0.8746, ..., 0.1754, -0.4630, -0.6850]],
grad_fn=<TanhBackward>)

Now the outputs of each layer no longer shrink layer by layer.

PyTorch provides a built-in function for Xavier initialization:

nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)

Kaiming Initialization

Xavier initialization is only suited to saturating activation functions; it does not apply to non-saturating ones such as ReLU.

Suppose we replace the activation function in the code above with ReLU.

def forward(self, x):
    for (i, linear) in enumerate(self.linears):
        x = linear(x)
        # switch the activation function to relu
        x = torch.relu(x)

        print("layer:{}, std:{}".format(i, x.std()))
        if torch.isnan(x.std()):
            print("output is nan in {} layers".format(i))
            break  # stop once the output std has overflowed to nan
    return x


layer:0, std:0.9736467599868774
layer:1, std:1.1656721830368042
layer:2, std:1.4657647609710693
layer:3, std:1.7342147827148438
layer:4, std:1.9326763153076172
layer:5, std:2.3239614963531494
layer:6, std:2.8289241790771484
layer:7, std:3.27266263961792
layer:8, std:3.999720573425293
layer:9, std:4.30142068862915
layer:10, std:5.378474235534668
layer:11, std:5.945633411407471
layer:12, std:7.611734867095947
layer:13, std:8.23315143585205
layer:14, std:9.557497024536133
layer:15, std:11.985333442687988
layer:16, std:13.370684623718262
layer:17, std:14.82516860961914
layer:18, std:16.142274856567383
layer:19, std:20.275897979736328
layer:20, std:21.284557342529297
layer:21, std:23.966691970825195
layer:22, std:27.94208526611328
layer:23, std:30.947021484375
layer:24, std:34.0330810546875
layer:25, std:41.57271194458008
layer:26, std:50.857303619384766
layer:27, std:55.795127868652344
layer:28, std:61.40922546386719
layer:29, std:65.00013732910156
layer:30, std:88.00929260253906
layer:31, std:111.04611206054688
layer:32, std:124.12654876708984
layer:33, std:147.9998779296875
layer:34, std:183.37405395507812
layer:35, std:246.94544982910156
layer:36, std:300.00946044921875
layer:37, std:383.9361267089844
layer:38, std:487.6725769042969
layer:39, std:600.6978759765625
layer:40, std:716.6215209960938
layer:41, std:953.6651611328125
layer:42, std:1133.579833984375
layer:43, std:1312.9854736328125
layer:44, std:1593.806396484375
layer:45, std:1823.689208984375
layer:46, std:2410.478515625
layer:47, std:3021.795166015625
layer:48, std:3830.048828125
layer:49, std:4210.138671875
layer:50, std:4821.57373046875
layer:51, std:6131.04248046875
layer:52, std:7420.24853515625
layer:53, std:8933.3251953125
layer:54, std:10330.2490234375
layer:55, std:10835.60546875
layer:56, std:11556.45703125
layer:57, std:14248.263671875
layer:58, std:16818.216796875
layer:59, std:19947.498046875
layer:60, std:26884.26953125
layer:61, std:30824.623046875
layer:62, std:37823.87109375
layer:63, std:42357.39453125
layer:64, std:50901.15234375
layer:65, std:61428.8515625
layer:66, std:81561.578125
layer:67, std:110124.4375
layer:68, std:115554.2265625
layer:69, std:140480.390625
layer:70, std:156277.6875
layer:71, std:188532.859375
layer:72, std:218894.390625
layer:73, std:265436.46875
layer:74, std:280642.125
layer:75, std:317877.53125
layer:76, std:374410.90625
layer:77, std:467014.0
layer:78, std:581226.0625
layer:79, std:748015.75
layer:80, std:1023228.25
layer:81, std:1199279.625
layer:82, std:1498950.875
layer:83, std:1649968.25
layer:84, std:2011411.875
layer:85, std:2473448.25
layer:86, std:2742399.5
layer:87, std:2971178.75
layer:88, std:3524154.5
layer:89, std:3989879.75
layer:90, std:4953882.5
layer:91, std:5743866.5
layer:92, std:6304035.5
layer:93, std:7455818.5
layer:94, std:8783134.0
layer:95, std:11827082.0
layer:96, std:13226204.0
layer:97, std:17922894.0
layer:98, std:19208862.0
layer:99, std:23558832.0
tensor([[45271512., 12699174., 0., ..., 16133147., 70351472.,
50286560.],
[45883780., 13033413., 0., ..., 16280076., 68406784.,
49523924.],
[36918948., 9552099., 0., ..., 13450978., 56660272.,
40777116.],
...,
[50119664., 14681072., 0., ..., 18321080., 74405592.,
54920220.],
[39083596., 11305603., 0., ..., 14831032., 59826176.,
44137924.],
[58225608., 16406362., 0., ..., 21336580., 88127248.,
64016336.]], grad_fn=<ReluBackward0>)

The outputs keep growing layer by layer, which may lead to exploding gradients.

The Kaiming initialization method was designed for non-saturating activation functions such as ReLU and its variants.

Here a is the slope of the negative half-axis used by ReLU variants (e.g. Leaky ReLU).
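The Kaiming formula referenced above (figure omitted) can be written as

$$
D(W) = \frac{2}{(1 + a^2)\,n_i}, \qquad
\mathrm{std}(W) = \sqrt{\frac{2}{(1 + a^2)\,n_i}}
$$

where $a$ is the negative-slope coefficient; for plain ReLU, $a = 0$ and $\mathrm{std}(W) = \sqrt{2/n_i}$, which is what the code below uses.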

Let us switch the code above to Kaiming initialization.

# Kaiming initialization
nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))

layer:0, std:0.8256813883781433
layer:1, std:0.8159489631652832
layer:2, std:0.8242867588996887
layer:3, std:0.7728303074836731
layer:4, std:0.7587776780128479
layer:5, std:0.7840063571929932
layer:6, std:0.812754213809967
layer:7, std:0.8468071222305298
layer:8, std:0.750910222530365
layer:9, std:0.7862679958343506
layer:10, std:0.8471992015838623
layer:11, std:0.8358661532402039
layer:12, std:0.8365052938461304
layer:13, std:0.941089391708374
layer:14, std:0.8334775567054749
layer:15, std:0.7378053665161133
layer:16, std:0.7786925435066223
layer:17, std:0.7801215052604675
layer:18, std:0.744530975818634
layer:19, std:0.7729376554489136
layer:20, std:0.7996224164962769
layer:21, std:0.7438898682594299
layer:22, std:0.8052859306335449
layer:23, std:0.8013993501663208
layer:24, std:0.8458987474441528
layer:25, std:0.9109481573104858
layer:26, std:0.8329163789749146
layer:27, std:0.7938662171363831
layer:28, std:0.7818281650543213
layer:29, std:0.7433196306228638
layer:30, std:0.8786559700965881
layer:31, std:0.8984055519104004
layer:32, std:0.7985660433769226
layer:33, std:0.9004362225532532
layer:34, std:0.8771427273750305
layer:35, std:0.8327329158782959
layer:36, std:0.7529121041297913
layer:37, std:0.7560306191444397
layer:38, std:0.7768694758415222
layer:39, std:0.7229365706443787
layer:40, std:0.6847150921821594
layer:41, std:0.6917332410812378
layer:42, std:0.7432793974876404
layer:43, std:0.7119105458259583
layer:44, std:0.723327100276947
layer:45, std:0.692399263381958
layer:46, std:0.7096500992774963
layer:47, std:0.7403143644332886
layer:48, std:0.7733916640281677
layer:49, std:0.7756217122077942
layer:50, std:0.8285183906555176
layer:51, std:0.811735987663269
layer:52, std:0.766518771648407
layer:53, std:0.8102941513061523
layer:54, std:0.7746390104293823
layer:55, std:0.8069944977760315
layer:56, std:0.8859500288963318
layer:57, std:0.8730546236038208
layer:58, std:0.8584580421447754
layer:59, std:0.8817113041877747
layer:60, std:0.8609682321548462
layer:61, std:0.6981067657470703
layer:62, std:0.6881256103515625
layer:63, std:0.7074345350265503
layer:64, std:0.8192773461341858
layer:65, std:0.7355524301528931
layer:66, std:0.7250872254371643
layer:67, std:0.7580576539039612
layer:68, std:0.6964988112449646
layer:69, std:0.752277135848999
layer:70, std:0.7253941893577576
layer:71, std:0.6531673073768616
layer:72, std:0.6727007627487183
layer:73, std:0.6199373006820679
layer:74, std:0.5950634479522705
layer:75, std:0.61208176612854
layer:76, std:0.6338782906532288
layer:77, std:0.6784136891365051
layer:78, std:0.6899406313896179
layer:79, std:0.7904988527297974
layer:80, std:0.74302738904953
layer:81, std:0.8214826583862305
layer:82, std:0.9201915264129639
layer:83, std:0.8273651599884033
layer:84, std:0.8774834275245667
layer:85, std:0.7430731654167175
layer:86, std:0.8204286694526672
layer:87, std:0.7464808821678162
layer:88, std:0.7037572860717773
layer:89, std:0.7689121961593628
layer:90, std:0.6902880668640137
layer:91, std:0.68663489818573
layer:92, std:0.6811012029647827
layer:93, std:0.7253351807594299
layer:94, std:0.7396165132522583
layer:95, std:0.786566436290741
layer:96, std:0.8232990503311157
layer:97, std:0.8759231567382812
layer:98, std:0.8548166155815125
layer:99, std:0.8607176542282104
tensor([[0.0000, 0.0000, 1.0538, ..., 0.9840, 0.0000, 0.0000],
[0.0000, 0.0000, 0.9070, ..., 0.8981, 0.0000, 0.0000],
[0.0000, 0.0000, 1.0179, ..., 1.0203, 0.0000, 0.0000],
...,
[0.0000, 0.0000, 0.7957, ..., 0.6951, 0.0000, 0.0000],
[0.0000, 0.0000, 0.9696, ..., 0.8639, 0.0000, 0.0000],
[0.0000, 0.0000, 0.9062, ..., 0.8373, 0.0000, 0.0000]],
grad_fn=<ReluBackward0>)

PyTorch also provides a built-in function for Kaiming initialization:

nn.init.kaiming_normal_(m.weight.data)

nn.init.calculate_gain

Purpose: computes the scale by which an activation function changes the variance (the gain).

Main parameters:

  • nonlinearity: name of the activation function
  • param: parameter of the activation function, e.g. negative_slope for Leaky ReLU
x = torch.randn(10000)
out = torch.tanh(x)

gain = x.std() / out.std()
print('gain:{}'.format(gain))

tanh_gain = nn.init.calculate_gain('tanh')
print('tanh_gain in PyTorch:', tanh_gain)

gain:1.5917036533355713
tanh_gain in PyTorch: 1.6666666666666667

In other words, after passing through tanh the standard deviation of the data shrinks by a factor of roughly 1.6.

Loss Functions

The Concept of a Loss Function

A loss function measures the discrepancy between the model output and the ground-truth label.

Strictly speaking, the loss function refers to a single sample, the cost function refers to all samples, and the objective function consists of the cost function (to be made as small as possible) plus a regularization term that guards against overfitting.
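Written out as formulas:

$$
\mathrm{Loss} = f(\hat{y}, y), \qquad
\mathrm{Cost} = \frac{1}{N}\sum_{i=1}^{N} f(\hat{y}_i, y_i), \qquad
\mathrm{Obj} = \mathrm{Cost} + \mathrm{Regularization}
$$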

Losses in PyTorch

The size_average and reduce arguments are deprecated; do not use them.

nn.CrossEntropyLoss

Purpose: combines nn.LogSoftmax() and nn.NLLLoss() to compute the cross-entropy loss.

Parameters:

  • weight: per-class weight applied to the loss
  • ignore_index: a class index to be ignored
  • reduction: reduction mode, one of none/sum/mean

none: compute the loss element-wise

sum: sum all elements and return a scalar

mean: weighted average, returning a scalar

P denotes the distribution of the training-set labels and Q the distribution of the predictions.

Entropy is the expectation of self-information. H(P) is the entropy of the training labels, which is a constant, so minimizing the cross entropy amounts to minimizing the KL divergence.

Here x is the vector of neuron outputs and class is the index of the target category.

The first formula is the form without the weight argument; the second is the form with weight set (both are reconstructed below).
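Reconstructed from the PyTorch documentation, the formulas are:

$$
H(P, Q) = -\sum_x P(x)\log Q(x) = H(P) + D_{KL}(P \,\|\, Q)
$$

$$
\mathrm{loss}(x, \mathrm{class})
= -\log\frac{\exp(x[\mathrm{class}])}{\sum_j \exp(x[j])}
= -x[\mathrm{class}] + \log\sum_j \exp(x[j])
$$

$$
\mathrm{loss}(x, \mathrm{class})
= \mathrm{weight}[\mathrm{class}]\Big(-x[\mathrm{class}] + \log\sum_j \exp(x[j])\Big)
$$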

PyTorch code (without weight)

import torch
import torch.nn as nn
import numpy as np


inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

loss_f_none = nn.CrossEntropyLoss(weight=None, reduction='none')
loss_f_sum = nn.CrossEntropyLoss(weight=None, reduction='sum')
loss_f_mean = nn.CrossEntropyLoss(weight=None, reduction='mean')

# forward
loss_none = loss_f_none(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)

# view
print("Cross Entropy Loss:\n", loss_none, loss_sum, loss_mean)

# compute by hand
idx = 0
input_1 = inputs.detach().numpy()[idx] # [1, 2]
target_1 = target.numpy()[idx] # 0

# first term: x[class]
x_class = input_1[target_1]

# second term: log(sum(exp(x)))
sigma_exp_x = np.sum(list(map(np.exp, input_1)))
log_sigma_exp_x = np.log(sigma_exp_x)

# loss of the first sample
loss_1 = -x_class + log_sigma_exp_x
print("loss of the first sample: {}".format(loss_1))


Cross Entropy Loss:
tensor([1.3133, 0.1269, 0.1269]) tensor(1.5671) tensor(0.5224)
loss of the first sample: 1.3132617473602295

PyTorch code (with weight)

weights = torch.tensor([1, 2], dtype=torch.float)

loss_f_none_w = nn.CrossEntropyLoss(weight=weights, reduction='none')
loss_f_sum_w = nn.CrossEntropyLoss(weight=weights, reduction='sum')
loss_f_mean_w = nn.CrossEntropyLoss(weight=weights, reduction='mean')

# forward
loss_none_w = loss_f_none_w(inputs, target)
loss_sum_w = loss_f_sum_w(inputs, target)
loss_mean_w = loss_f_mean_w(inputs, target)

# view
print("\nweights: ", weights)
print(loss_none_w, loss_sum_w, loss_mean_w)

weights: tensor([1., 2.])
tensor([1.3133, 0.2539, 0.2539]) tensor(1.8210) tensor(0.3642)

Because the classes were given different weights (class 0 has weight 1, class 1 has weight 2), the loss of the first sample, whose class is 0, is multiplied by 1 and remains 1.3133, while the losses of the second and third samples, whose class is 1, are multiplied by 2.

With reduction='sum', these losses are simply added up;

with reduction='mean', the sum is divided by the total number of weighted shares (1 + 2 + 2 = 5).

If weight is changed to [0.7, 0.3], the output becomes

weights:  tensor([0.7000, 0.3000])
tensor([0.9193, 0.0381, 0.0381]) tensor(0.9954) tensor(0.7657)
# 0.9954/(0.7+0.3+0.3) = 0.7657

The none and sum modes are easy to interpret with weights; for the mean mode, let us verify the result by hand.

# compute by hand
weights = torch.tensor([1, 2], dtype=torch.float)
# total number of weighted shares: target [0, 1, 1] --> sum([1, 2, 2]) = 5
weights_all = np.sum(list(map(lambda x: weights.numpy()[x], target.numpy())))

mean = 0
# reuse the unweighted loss_none from above: tensor([1.3133, 0.1269, 0.1269]);
# multiplying each entry by its class weight reproduces the weighted losses
loss_sep = loss_none.detach().numpy()
for i in range(target.shape[0]):
    x_class = target.numpy()[i]
    tmp = loss_sep[i] * (weights.numpy()[x_class] / weights_all)
    mean += tmp

print(mean)

0.3641947731375694

nn.NLLLoss

Purpose: implements the negation in the negative log-likelihood loss.

Main parameters:

  • weight: per-class weight applied to the loss
  • ignore_index: a class index to be ignored
  • reduction: reduction mode, one of none/sum/mean

# NLLLoss
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)

weights = torch.tensor([1, 1], dtype=torch.float)
loss_f_none_w = nn.NLLLoss(weight=weights, reduction='none')
loss_f_sum_w = nn.NLLLoss(weight=weights, reduction='sum')
loss_f_mean_w = nn.NLLLoss(weight=weights, reduction='mean')

loss_none_w = loss_f_none_w(inputs, target)
loss_sum_w = loss_f_sum_w(inputs, target)
loss_mean_w = loss_f_mean_w(inputs, target)

# view
print("\n weights:", weights)
print("NLL Loss", loss_none_w, loss_sum_w, loss_mean_w)

weights: tensor([1., 1.])
NLL Loss tensor([-1., -3., -3.]) tensor(-7.) tensor(-2.3333)

Here the network outputs are x1=[1, 2], x2=[1, 3] and x3=[1, 3]. The first output corresponds to class 0, so its loss is x1[0] negated, giving -1; likewise the second and third outputs correspond to class 1, so their losses are x2[1] and x3[1] negated, giving -3.

Note: there are only two classes here (0 and 1), so each output x has length 2; with three classes the output would have length 3.

x1, x2 and x3 can be seen as the outputs for different samples, while the entries inside x1, e.g. [1, 2], are the outputs of different neurons.
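For reference, the per-sample formula from the PyTorch documentation is

$$
l_n = -w_{y_n}\, x_{n,\, y_n}
$$

i.e. the loss simply negates (and optionally weights) the input at the target index. In normal use x would be log-probabilities (the output of LogSoftmax); raw values are used here only to illustrate the negation.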

nn.BCELoss

Purpose: binary cross entropy; the input values must lie in [0, 1] (formula sketch after the parameter list).

Main parameters:

  • weight: per-class weight applied to the loss
  • ignore_index: a class index to be ignored
  • reduction: reduction mode, one of none/sum/mean
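The per-element formula (from the PyTorch documentation) is

$$
l_n = -w_n\big[y_n \log x_n + (1 - y_n)\log(1 - x_n)\big]
$$

and it is applied to every neuron individually, as the comment in the code below points out.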

# BCELoss
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
# note: the loss here is computed per neuron, not per whole sample
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target
# BCELoss expects inputs in [0, 1]
inputs = torch.sigmoid(inputs)
weights = torch.tensor([1, 1])

loss_f_none_w = nn.BCELoss(weight=weights, reduction='none')
loss_f_sum_w = nn.BCELoss(weight=weights, reduction='sum')
loss_f_mean_w = nn.BCELoss(weight=weights, reduction='mean')

loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum_w = loss_f_sum_w(inputs, target_bce)
loss_mean_w = loss_f_mean_w(inputs, target_bce)

print("\n weights:", weights)
print("BCELoss Loss", loss_none_w, loss_sum_w, loss_mean_w)

weights: tensor([1, 1])
BCELoss Loss tensor([[0.3133, 2.1269],
[0.1269, 2.1269],
[3.0486, 0.0181],
[4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)
# compute by hand
idx = 0
x_i = inputs.detach().numpy()[idx, idx]
y_i = target.numpy()[idx, idx]

l_i = -y_i * np.log(x_i) if y_i else -(1-y_i) * np.log(1 - x_i)
print("BCE inputs:", inputs)
print("第一个Loss为:", l_i)

BCE inputs: tensor([[0.7311, 0.8808],
[0.8808, 0.8808],
[0.9526, 0.9820],
[0.9820, 0.9933]])
loss of the first element: 0.31326166

nn.BCEWithLogitsLoss

Purpose: combines a Sigmoid with binary cross entropy.

Note: in this case, do not add a sigmoid at the end of the network.

Main parameters:

  • pos_weight: weight of the positive samples (the loss term of each positive sample is multiplied by pos_weight)
  • weight: per-class weight applied to the loss
  • ignore_index: a class index to be ignored
  • reduction: reduction mode, one of none/sum/mean
# BCEWithLogitsLoss
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target
weights = torch.tensor([1, 1], dtype=torch.float)

loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none')
loss_f_sum_w = nn.BCEWithLogitsLoss(weight=weights, reduction='sum')
loss_f_mean_w = nn.BCEWithLogitsLoss(weight=weights, reduction='mean')

loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum_w = loss_f_sum_w(inputs, target_bce)
loss_mean_w = loss_f_mean_w(inputs, target_bce)

# view
print("\nweight: ", weights)
print(loss_none_w, loss_sum_w, loss_mean_w)

weight: tensor([1., 1.])
tensor([[0.3133, 2.1269],
[0.1269, 2.1269],
[3.0486, 0.0181],
[4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)

Compared with BCELoss, the inputs here are not passed through a sigmoid, yet the resulting losses are identical to those of BCELoss above.

The following illustrates the pos_weight argument.

# BCEWithLogitsLoss with pos_weight
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)

target_bce = target
weights = torch.tensor([1, 1], dtype=torch.float)
pos_w = torch.tensor([3], dtype=torch.float)

loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none', pos_weight=pos_w)
loss_f_sum_w = nn.BCEWithLogitsLoss(weight=weights, reduction='sum', pos_weight=pos_w)
loss_f_mean_w = nn.BCEWithLogitsLoss(weight=weights, reduction='mean', pos_weight=pos_w)

loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum_w = loss_f_sum_w(inputs, target_bce)
loss_mean_w = loss_f_mean_w(inputs, target_bce)

# view
print("\nweight: ", weights)
print("pos_weights: ", pos_w)
print(loss_none_w, loss_sum_w, loss_mean_w)

weight: tensor([1., 1.])
pos_weights: tensor([3.])
tensor([[0.9398, 2.1269],
[0.3808, 2.1269],
[3.0486, 0.0544],
[4.0181, 0.0201]]) tensor(12.7158) tensor(1.5895)

The first neuron of samples 1 and 2 and the second neuron of samples 3 and 4 have target class 1 (positive samples), so their losses are multiplied by pos_weight, e.g. 0.3133 * 3 = 0.9398.

nn.L1Loss

Purpose: computes the absolute difference between inputs and target.

nn.MSELoss

Purpose: computes the squared difference between inputs and target.

nn.SmoothL1Loss

Purpose: a smoothed version of L1Loss.

nn.PoissonNLLLoss

Purpose: negative log-likelihood loss for a Poisson distribution.

nn.KLDivLoss

nn.MarginRankingLoss

Purpose: compares the relative ordering of two sets of inputs; used in ranking tasks.

Note: this loss compares two groups of data and returns an n*n loss matrix.

Main parameters:

  • margin: the boundary value, i.e. the required gap between x1 and x2
  • reduction: reduction mode, one of none/sum/mean

When y = 1, we want x1 to be larger than x2; no loss is produced when x1 > x2.

When y = -1, we want x2 to be larger than x1; no loss is produced when x2 > x1.
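The formula (from the PyTorch documentation) is

$$
\mathrm{loss}(x_1, x_2, y) = \max\big(0,\; -y\,(x_1 - x_2) + \mathrm{margin}\big)
$$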

# Margin Ranking Loss

x1 = torch.tensor([[1], [2], [3]], dtype=torch.float)
x2 = torch.tensor([[2], [2], [2]], dtype=torch.float)

target = torch.tensor([1, 1, -1], dtype=torch.float)
loss_f_none = nn.MarginRankingLoss(margin=0, reduction='none')
loss = loss_f_none(x1, x2, target)
print(loss)

tensor([[1., 1., 0.],
[0., 0., 0.],
[0., 0., 1.]])

Here the loss is computed between every element of x1 and every element of x2, giving a 3x3 matrix. When the first element of x1 ([1]) is compared with x2, y is [1, 1, -1]: since 1 < 2 contradicts the desired x1 > x2, the loss is 2 - 1 = 1; wherever the expectation is satisfied, no loss is produced (the entry is 0).

nn.MultiLabelMarginLoss

Multi-label: a single sample may belong to several classes.

Purpose: multi-label margin loss.

nn.SoftMarginLoss

Purpose: computes the logistic loss for binary classification.
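The formula (from the PyTorch documentation) is

$$
\mathrm{loss}(x, y) = \sum_i \frac{\log\big(1 + \exp(-y_i x_i)\big)}{x.\mathrm{nelement}()}
$$

with reduction='none' the per-element terms $\log(1 + \exp(-y_i x_i))$ are returned, e.g. $\log(1 + e^{0.3}) \approx 0.8544$ for the first entry of the example below.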

# SoftMargin Loss

inputs = torch.tensor([[0.3, 0.7], [0.5, 0.5]], dtype=torch.float)
target = torch.tensor([[-1, 1], [1, -1]], dtype=torch.float)

loss_f_none = nn.SoftMarginLoss(reduction='none')
loss = loss_f_none(inputs, target)
print("SoftMargin: ", loss)

SoftMargin: tensor([[0.8544, 0.4032],
[0.4741, 0.9741]])

nn.MultiLabelSoftMarginLoss

Purpose: the multi-label version of SoftMarginLoss.

nn.MultiMarginLoss

Purpose: computes the hinge (max-margin) loss for multi-class classification.

nn.TripletMarginLoss

Purpose: computes the triplet loss, commonly used in face verification.

The goal is to pull the anchor closer to the positive sample and push it farther away from the negative sample.
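The formula (from the PyTorch documentation) is

$$
L(a, p, n) = \max\big(d(a, p) - d(a, n) + \mathrm{margin},\; 0\big), \qquad
d(x_i, y_i) = \lVert x_i - y_i \rVert_p
$$

which matches the hand computation in the example below: $\max(1 - 0.5 + 1,\, 0) = 1.5$.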

# Triplet Margin Loss
anchor = torch.tensor([[1.]])
pos = torch.tensor([[2.]])
neg = torch.tensor([[0.5]])

loss_f = nn.TripletMarginLoss(margin=1.0, p=1)
loss = loss_f(anchor, pos, neg)
print("Triplet Margin Loss", loss)

# dap=1, dan=0.5, loss = max(1-0.5+1, 0) = 1.5
Triplet Margin Loss tensor(1.5000)

nn.HingeEmbeddingLoss

Purpose: measures the similarity of two inputs; commonly used for non-linear embeddings and semi-supervised learning.

Important: the input x should be the absolute difference of the two inputs being compared.
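The per-element formula (from the PyTorch documentation) is

$$
l_n =
\begin{cases}
x_n, & y_n = 1 \\
\max\{0,\; \Delta - x_n\}, & y_n = -1
\end{cases}
$$

where $\Delta$ is the margin; in the example below, margin=1 and the third element (y=-1) gives $\max(0,\, 1 - 0.5) = 0.5$.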

# Hinge Embedding Loss
inputs = torch.tensor([[1., 0.8, 0.5]])
target = torch.tensor([[1, 1, -1]])
loss_f = nn.HingeEmbeddingLoss(margin=1, reduction='none')
loss = loss_f(inputs, target)
print("Hinge Embedding Loss: ", loss)

Hinge Embedding Loss: tensor([[1.0000, 0.8000, 0.5000]])

The triangle symbol (Δ) in the formula denotes the margin parameter.

nn.CosineEmbeddingLoss

Purpose: measures the similarity of two inputs using cosine similarity (formula sketch after the parameter list).

Main parameters:

  • margin: may take values in [-1, 1]; [0, 0.5] is recommended
  • reduction: reduction mode, one of none/sum/mean
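The formula (from the PyTorch documentation) is

$$
\mathrm{loss}(x, y) =
\begin{cases}
1 - \cos(x_1, x_2), & y = 1 \\
\max\{0,\; \cos(x_1, x_2) - \mathrm{margin}\}, & y = -1
\end{cases}
$$

which is verified by hand at the end of the code below.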

# Cosine Embedding Loss
x1 = torch.tensor([[0.3, 0.5, 0.7], [0.3, 0.5, 0.7]])
x2 = torch.tensor([[0.1, 0.3, 0.5], [0.1, 0.3, 0.5]])
target = torch.tensor([1, -1], dtype=torch.float)

loss_f = nn.CosineEmbeddingLoss(margin=0, reduction='none')

loss = loss_f(x1, x2, target)
print(loss)

tensor([0.0167, 0.9833])
# compute by hand
margin = 0


def cosine(a, b):
    numerator = torch.dot(a, b)
    denominator = torch.norm(a, 2) * torch.norm(b, 2)
    return float(numerator / denominator)


l_1 = 1 - (cosine(x1[0], x2[0]))
# the second pair (y=-1) is identical to the first, so x1[0], x2[0] are reused here
l_2 = max(0, cosine(x1[0], x2[0]) - margin)
print(l_1, l_2)

0.016662120819091797 0.9833378791809082

nn.CTCLoss

Purpose: computes the CTC (Connectionist Temporal Classification) loss, used for sequence classification problems such as OCR (a usage sketch follows the parameter list).

Main parameters:

  • blank: index of the blank label
  • zero_infinity: set infinite losses (and their gradients) to zero
  • reduction: reduction mode, one of none/sum/mean
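The post gives no code example for CTCLoss, so here is a minimal usage sketch adapted from the example in the PyTorch documentation; the shapes and random values are purely illustrative.

import torch
import torch.nn as nn

T, N, C = 50, 16, 20      # input (time) length, batch size, number of classes incl. blank
S, S_min = 30, 10         # maximum / minimum target length

# log_probs must be log-probabilities over the class dimension, shape (T, N, C)
log_probs = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, C, (N, S), dtype=torch.long)    # class 0 is reserved for blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.randint(S_min, S, (N,), dtype=torch.long)

ctc_loss = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)
loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
loss.backward()
print("CTC loss:", loss)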