Building a Text Summarization Baseline and Starting Training

Earlier articles in this series already covered the principles and training of word2vec, the principles and practice of the seq2seq model, and the attention mechanism. Building on all of that, this article starts training a text summarization model. It is only a fairly plain baseline; the model will be improved step by step in later posts.

Building the seq2seq model

First, the Encoder, Decoder, and attention layers wrapped up in the previous seq2seq practice post are assembled into this model. On top of that, one training trick is added: teacher forcing. So what exactly is teacher forcing?

The output of a seq2seq model is a sequence of probability distributions produced by the decoder, so how those distributions are decoded matters a great deal. Common strategies include greedy search, teacher forcing, and approaches in between such as beam search.
    Greedy search: when predicting the word at time step t, take the highest-probability word from the output distribution at step t−1 and feed it in as the input at step t. The drawback is that if one prediction is wrong, every prediction that follows can be dragged off course.
    Teacher forcing: when predicting the word at time step t, feed in the ground-truth word from step t−1 instead. The drawback is that training looks great (the true labels are always available) while test performance can be much worse (no ground-truth words are available at inference time).
    In practice, decoding is usually done with something between these two extremes, such as beam search: when predicting the word at step t, keep the top K most probable words from step t−1, which preserves more candidates (fixing the weakness of the first method); and during training, use the true word as the input only with some probability P (fixing the weakness of the second method). Greedy search and beam search will each be introduced later; below is the concrete implementation of teacher forcing.

import tensorflow as tf

from src.seq2seq_tf2.model_layers import Encoder, BahdanauAttention, Decoder
from src.utils.gpu_utils import config_gpu
from src.utils.params_utils import get_params
from src.utils.wv_loader import load_embedding_matrix, Vocab


class Seq2Seq(tf.keras.Model):
    def __init__(self, params, vocab):
        super(Seq2Seq, self).__init__()
        self.embedding_matrix = load_embedding_matrix()
        self.params = params
        self.vocab = vocab
        self.batch_size = params["batch_size"]
        self.enc_units = params["enc_units"]
        self.dec_units = params["dec_units"]
        self.attn_units = params["attn_units"]
        self.encoder = Encoder(self.embedding_matrix,
                               self.enc_units,
                               self.batch_size)

        self.attention = BahdanauAttention(self.attn_units)

        self.decoder = Decoder(self.embedding_matrix,
                               self.dec_units,
                               self.batch_size)

    def teacher_decoder(self, dec_hidden, enc_output, dec_target):
        predictions = []

        # the first decoder input is the <START> token
        dec_input = tf.expand_dims([self.vocab.START_DECODING_INDEX] * self.batch_size, 1)

        # teacher forcing: feed the target word as the next input and decode step by step
        for t in range(1, dec_target.shape[1]):
            # pass enc_output to the decoder and predict the vocabulary distribution one step at a time
            pred, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            dec_input = tf.expand_dims(dec_target[:, t], 1)

            predictions.append(pred)

        return tf.stack(predictions, 1), dec_hidden
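
The last item in the list above also mentioned feeding the true word only with some probability P, a trick usually called scheduled sampling. It is not part of this baseline; the following is only a minimal sketch of how it could be added as another method of Seq2Seq, reusing the same Decoder interface as teacher_decoder, with a hypothetical teacher_forcing_ratio argument (it also requires `import random` at the top of the module).

    def scheduled_sampling_decoder(self, dec_hidden, enc_output, dec_target, teacher_forcing_ratio=0.5):
        # sketch only: with probability `teacher_forcing_ratio` feed the ground-truth word,
        # otherwise feed the model's own greedy prediction from the previous step
        predictions = []
        dec_input = tf.expand_dims([self.vocab.START_DECODING_INDEX] * self.batch_size, 1)

        for t in range(1, dec_target.shape[1]):
            pred, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predictions.append(pred)

            if random.random() < teacher_forcing_ratio:
                # teacher forcing: use the true word at step t as the next input
                dec_input = tf.expand_dims(dec_target[:, t], 1)
            else:
                # free running: use the most probable word predicted at the previous step
                dec_input = tf.expand_dims(tf.argmax(pred, axis=-1, output_type=tf.int32), 1)

        return tf.stack(predictions, 1), dec_hidden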

Start training

import tensorflow as tf

from src.seq2seq_tf2.seq2seq_model import Seq2Seq
from src.seq2seq_tf2.train_helper import train_model
from src.utils.gpu_utils import config_gpu
from src.utils.params_utils import get_params
from src.utils.wv_loader import Vocab


def train(params):
    # configure GPU resources
    config_gpu(use_cpu=True)

    # load the vocabulary
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    params['vocab_size'] = vocab.count

    # build the model
    print("Building the model ...")
    model = Seq2Seq(params, vocab)

    # set up the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)

    # train the model
    train_model(model, vocab, params, checkpoint_manager)
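
For reference, a minimal entry point for this script could look like the sketch below; it assumes get_params() returns a dict with the keys used above (vocab_path, vocab_size, checkpoint_dir, batch_size, learning_rate, and so on), as in the earlier posts.

if __name__ == '__main__':
    # collect hyperparameters and paths, then kick off training
    params = get_params()
    train(params)

The train_model helper it calls, together with the loss function, the training step, and the validation loop, is shown next.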
import tensorflow as tf

# from src.pgn_tf2.batcher import batcher
from src.seq2seq_tf2.seq2seq_batcher import train_batch_generator
import time
from functools import partial


def train_model(model, vocab, params, checkpoint_manager):
    epochs = params['epochs']

    pad_index = vocab.word2id[vocab.PAD_TOKEN]

    # record the vocabulary size
    params['vocab_size'] = vocab.count

    optimizer = tf.keras.optimizers.Adam(name='Adam', learning_rate=params['learning_rate'])

    train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch = train_batch_generator(
        params['batch_size'], params['max_enc_len'], params['max_dec_len'], params['buffer_size']
    )

    for epoch in range(epochs):
        start = time.time()
        enc_hidden = model.encoder.initialize_hidden_state()

        total_loss = 0.
        running_loss = 0.
        for (batch, (inputs, target)) in enumerate(train_dataset.take(train_steps_per_epoch), start=1):

            batch_loss = train_step(model, inputs, target, enc_hidden,
                                    loss_function=partial(loss_function, pad_index=pad_index),
                                    optimizer=optimizer)
            total_loss += batch_loss

            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                             batch,
                                                             (total_loss - running_loss) / 50))
                running_loss = total_loss

        # save a checkpoint every 2 epochs
        if (epoch + 1) % 2 == 0:
            ckpt_save_path = checkpoint_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
                                                                ckpt_save_path))

        valid_loss = evaluate(model, val_dataset, val_steps_per_epoch,
                              loss_func=partial(loss_function, pad_index=pad_index))

        print('Epoch {} Loss {:.4f}; val Loss {:.4f}'.format(
            epoch + 1, total_loss / train_steps_per_epoch, valid_loss)
        )

        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


# loss function: <PAD> positions are masked out so they do not contribute to the loss
def loss_function(real, pred, pad_index):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    mask = tf.math.logical_not(tf.math.equal(real, pad_index))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


def train_step(model, enc_inp, dec_target, enc_hidden, loss_function=None, optimizer=None, mode='train'):
    with tf.GradientTape() as tape:

        enc_output, enc_hidden = model.encoder(enc_inp, enc_hidden)
        # the encoder's final hidden state becomes the decoder's initial hidden state
        dec_hidden = enc_hidden

        # decode the whole target sequence step by step with teacher forcing
        predictions, _ = model.teacher_decoder(dec_hidden, enc_output, dec_target)

        batch_loss = loss_function(dec_target[:, 1:], predictions)

    if mode == 'train':
        variables = (model.encoder.trainable_variables + model.decoder.trainable_variables
                     + model.attention.trainable_variables)

        gradients = tape.gradient(batch_loss, variables)

        # clip gradients by global norm to stabilize training
        gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


def evaluate(model, val_dataset, val_steps_per_epoch, loss_func):
    print('Starting evaluate ...')
    total_loss = 0.
    enc_hidden = model.encoder.initialize_hidden_state()
    for (batch, (inputs, target)) in enumerate(val_dataset.take(val_steps_per_epoch), start=1):
        batch_loss = train_step(model, inputs, target, enc_hidden,
                                loss_function=loss_func, mode='val')
        total_loss += batch_loss
    return total_loss / val_steps_per_epoch
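
To see what the PAD mask in loss_function does, here is a small self-contained check; the pad index 0 and the toy logits are made up purely for illustration.

import tensorflow as tf

# batch of 1, sequence length 3, vocab size 4, pad index 0; the last position is <PAD>
real = tf.constant([[2, 3, 0]])
pred = tf.random.uniform((1, 3, 4))   # fake logits

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
loss_ = loss_object(real, pred)       # shape (1, 3): one loss value per position
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), loss_.dtype)
print(loss_ * mask)                   # the <PAD> position contributes exactly 0

Note that tf.reduce_mean above averages over all positions, including the masked ones; dividing by tf.reduce_sum(mask) instead would average only over real tokens, which is a common variant. Next comes the batch generator that feeds train_model.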
from src.build_data.data_loader import load_dataset
import tensorflow as tf
from src.utils import config
from tqdm import tqdm


def train_batch_generator(batch_size, max_enc_len=200, max_dec_len=50, buffer_size=5, sample_sum=None):
    # load the training and validation sets
    train_X, train_Y = load_dataset(config.train_x_path, config.train_y_path,
                                    max_enc_len, max_dec_len)
    val_X, val_Y = load_dataset(config.test_x_path, config.test_y_path,
                                max_enc_len, max_dec_len)
    if sample_sum:
        train_X = train_X[:sample_sum]
        train_Y = train_Y[:sample_sum]
    print(f'total {len(train_Y)} examples ...')
    train_dataset = tf.data.Dataset.from_tensor_slices((train_X, train_Y)).shuffle(len(train_X),
                                                                                   reshuffle_each_iteration=True)
    val_dataset = tf.data.Dataset.from_tensor_slices((val_X, val_Y)).shuffle(len(val_X),
                                                                             reshuffle_each_iteration=True)
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True).prefetch(buffer_size)
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True).prefetch(buffer_size)
    train_steps_per_epoch = len(train_X) // batch_size
    val_steps_per_epoch = len(val_X) // batch_size
    return train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch
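
As a quick sanity check, the generator can be iterated directly; this sketch assumes the preprocessed .npy files referenced in config already exist, and the batch size of 32 is arbitrary.

train_dataset, val_dataset, train_steps, val_steps = train_batch_generator(batch_size=32)

# peek at one batch: both tensors hold word indices, truncated/padded to the max lengths
for enc_batch, dec_batch in train_dataset.take(1):
    print(enc_batch.shape)   # (32, max_enc_len)
    print(dec_batch.shape)   # (32, max_dec_len)

Finally, load_dataset simply reads the preprocessed index arrays from disk and truncates them: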
import numpy as np


def load_dataset(x_path, y_path, max_enc_len, max_dec_len, sample_sum=None):
    # load the index-encoded arrays and truncate them to the maximum lengths
    x = np.load(x_path + ".npy")
    y = np.load(y_path + ".npy")

    if sample_sum:
        x = x[:sample_sum, :max_enc_len]
        y = y[:sample_sum, :max_dec_len]
    else:
        x = x[:, :max_enc_len]
        y = y[:, :max_dec_len]
    return x, y
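
load_dataset expects each path plus ".npy" to point at a 2-D array of word indices. A throwaway example of that format, written to a temporary directory purely for illustration:

import os
import tempfile

import numpy as np

tmp_dir = tempfile.mkdtemp()
x_path = os.path.join(tmp_dir, 'toy_x')
y_path = os.path.join(tmp_dir, 'toy_y')

# two toy examples: each row is a sequence of word indices, already padded to equal length
np.save(x_path + '.npy', np.array([[4, 8, 15, 0, 0], [16, 23, 42, 7, 0]]))
np.save(y_path + '.npy', np.array([[4, 8, 0], [16, 23, 0]]))

x, y = load_dataset(x_path, y_path, max_enc_len=4, max_dec_len=2)
print(x.shape, y.shape)   # (2, 4) (2, 2): truncated to the requested maximum lengths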