import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
class Encoder(nn.Module):
    "Core encoder is a stack of N layers."
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
        return self.dropout(x)
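For reference, the encodings computed above are the sinusoids from the paper, where $pos$ is the position and $i$ indexes the dimension:

$$PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i/d_{\text{model}}}\right)$$
$$PE_{(pos, 2i+1)} = \cos\left(pos / 10000^{2i/d_{\text{model}}}\right)$$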
As shown below, the positional encoding adds in a sine wave based on position. The frequency and offset of the wave are different for each dimension.
plt.figure(figsize=(15, 5))
pe = PositionalEncoding(20, 0)
y = pe.forward(Variable(torch.zeros(1, 100, 20)))
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
None
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model
# Small example model.
tmp_model = make_model(10, 10, 2)
None
We implement label smoothing using the KL divergence loss. Instead of using a one-hot target distribution, we create a distribution that has `confidence` of the correct word, with the rest of the smoothing mass distributed throughout the vocabulary.
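The `LabelSmoothing` module itself is not shown in this excerpt. A minimal sketch of the idea, written in the old-style PyTorch API used elsewhere in this notebook (attribute names here are assumptions):

class LabelSmoothing(nn.Module):
    "Label smoothing with a KL-divergence criterion (sketch)."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size

    def forward(self, x, target):
        # x: log-probabilities of shape (batch, vocab); target: word indices.
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        # Spread the smoothing mass over the vocabulary
        # (excluding the correct word and the padding symbol).
        true_dist.fill_(self.smoothing / (self.size - 2))
        # Put the remaining confidence on the correct word.
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero out rows whose target is padding.
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.numel() > 0:
            true_dist.index_fill_(0, mask.squeeze(-1), 0.0)
        return self.criterion(x, Variable(true_dist, requires_grad=False))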
Batching matters a ton for speed. We want to have very evenly divided batches, with absolutely minimal padding. To do this we have to hack a bit around the default torchtext batching. This code patches their default batching to make sure we search over enough sentences to find tight batches.
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))
def rebatch(pad_idx, batch):
    "Fix order in torchtext to match ours."
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    return Batch(src, trg, pad_idx)
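MyIterator above takes a `batch_size_fn` so that batch size is measured in tokens rather than sentences; that function is defined earlier in the notebook and is not shown here. A sketch of such a token-counting function (the exact bookkeeping is an assumption):

# Sketch of a token-counting batch_size_fn for torchtext, assuming examples
# with .src and .trg fields. Running maxima make the returned size reflect
# tokens plus padding in the batch built so far.
max_src_in_batch, max_tgt_in_batch = 0, 0

def batch_size_fn(new, count, sofar):
    "Return the effective batch size (in tokens) if `new` is added."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <s>, </s>
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)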
Multi-GPU Training
Finally, to really target fast training, we will use multiple GPUs. This code implements multi-GPU word generation. It is not specific to the transformer, so I won't go into too much detail. The idea is to split up word generation at training time into chunks to be processed in parallel across many different GPUs. We do this using PyTorch parallel primitives (a toy sketch of these primitives follows the list):
replicate - split modules onto different GPUs.
scatter - split batches onto different GPUs.
parallel_apply - apply module to batches on different GPUs.
gather - pull scattered data back onto one GPU.
nn.DataParallel - a special module wrapper that calls all of these before evaluating.
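A toy sketch of these primitives, assuming two visible GPUs (the device ids, module, and batch below are hypothetical):

# Toy illustration of the parallel primitives.
devices = [0, 1]                                  # hypothetical GPU ids
layer = nn.Linear(512, 512).cuda(devices[0])      # hypothetical module
x = torch.randn(64, 512).cuda(devices[0])         # hypothetical batch

replicas = nn.parallel.replicate(layer, devices)              # copy the module to each GPU
chunks = nn.parallel.scatter(x, target_gpus=devices)          # split the batch across GPUs
outputs = nn.parallel.parallel_apply(replicas, [(c,) for c in chunks])  # run chunks in parallel
y = nn.parallel.gather(outputs, target_device=devices[0])     # collect results on one GPU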
# Skip if not interested in multigpu.
class MultiGPULossCompute:
    "A multi-gpu loss compute and train function."
    def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
        # Send out to different gpus.
        self.generator = generator
        self.criterion = nn.parallel.replicate(criterion, devices=devices)
        self.opt = opt
        self.devices = devices
        self.chunk_size = chunk_size

    def __call__(self, out, targets, normalize):
        total = 0.0
        generator = nn.parallel.replicate(self.generator, devices=self.devices)
        out_scatter = nn.parallel.scatter(out, target_gpus=self.devices)
        out_grad = [[] for _ in out_scatter]
        targets = nn.parallel.scatter(targets, target_gpus=self.devices)

        # Divide generating into chunks.
        chunk_size = self.chunk_size
        for i in range(0, out_scatter[0].size(1), chunk_size):
            # Predict distributions
            out_column = [[Variable(o[:, i:i+chunk_size].data,
                                    requires_grad=self.opt is not None)]
                          for o in out_scatter]
            gen = nn.parallel.parallel_apply(generator, out_column)

            # Compute loss.
            y = [(g.contiguous().view(-1, g.size(-1)),
                  t[:, i:i+chunk_size].contiguous().view(-1))
                 for g, t in zip(gen, targets)]
            loss = nn.parallel.parallel_apply(self.criterion, y)

            # Sum and normalize loss
            l = nn.parallel.gather(loss, target_device=self.devices[0])
            l = l.sum()[0] / normalize
            total += l.data[0]

            # Backprop loss to output of transformer
            if self.opt is not None:
                l.backward()
                for j, l in enumerate(loss):
                    out_grad[j].append(out_column[j][0].grad.data.clone())

        # Backprop all loss through transformer.
        if self.opt is not None:
            out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
            o1 = out
            o2 = nn.parallel.gather(out_grad, target_device=self.devices[0])
            o1.backward(gradient=o2)
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return total * normalize
Now we create our model, criterion, optimizer, data iterators, and parallelization.
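The cell that builds these objects is not included in this excerpt. A sketch of the setup, assuming the data fields `SRC`/`TGT`, the `train`/`val` datasets, and the `batch_size_fn` and `LabelSmoothing` helpers from earlier in the notebook:

# Sketch of the setup cell (names from earlier notebook sections are assumed).
devices = [0, 1, 2, 3]  # GPUs to use
pad_idx = TGT.vocab.stoi["<blank>"]
model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
model.cuda()
criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
criterion.cuda()
BATCH_SIZE = 12000
train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0, repeat=False,
                        sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True)
valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0, repeat=False,
                        sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=False)
model_par = nn.DataParallel(model, device_ids=devices)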
Now we train the model. I will play with the warmup steps a bit, but everything else uses the default parameters. On an AWS p3.8xlarge with 4 Tesla V100s, this runs at ~27,000 tokens per second with a batch size of 12,000 tokens.
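The `NoamOpt` wrapper used below is defined earlier in the notebook; it implements the learning-rate schedule from the paper, which increases the rate linearly for `warmup_steps` steps and then decays it proportionally to the inverse square root of the step number:

$$lrate = d_{\text{model}}^{-0.5} \cdot \min\left(step^{-0.5},\; step \cdot warmup\_steps^{-1.5}\right)$$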
if False:
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                        torch.optim.Adam(model.parameters(), lr=0,
                                         betas=(0.9, 0.98), eps=1e-9))
    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter),
                  model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                         model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None))
        print(loss)
else:
    model = torch.load("iwslt.pt")
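Once trained, we can decode the model to produce translations; below we translate the first sentence in the validation set. The `greedy_decode` helper is defined earlier in the notebook; a minimal sketch, assuming the `subsequent_mask` helper and the model's `encode`/`decode` methods from the earlier sections:

# Sketch of greedy decoding, assuming subsequent_mask and the
# EncoderDecoder.encode/decode methods defined earlier in the notebook.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(memory, src_mask, Variable(ys),
                           Variable(subsequent_mask(ys.size(1)).type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys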
for i, batch in enumerate(valid_iter):
    src = batch.src.transpose(0, 1)[:1]
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    out = greedy_decode(model, src, src_mask,
                        max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
    print("Translation:", end="\t")
    for i in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, i]]
        if sym == "</s>":
            break
        print(sym, end=" ")
    print()
    print("Target:", end="\t")
    for i in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[i, 0]]
        if sym == "</s>":
            break
        print(sym, end=" ")
    print()
    break
Translation: <unk> <unk> . In my language , that means , thank you very much .
Target: <unk> <unk> . It means in my language , thank you very much .
Additional Components: BPE, Search, Averaging
So this mostly covers the transformer model itself. There are four aspects that we didn't cover explicitly. We also have all of these additional features implemented in OpenNMT-py.
1) BPE/Word-piece: We can use a library to first preprocess the data into subword units. See Rico Sennrich's subword-nmt implementation. These models will transform the training data to look like this:
▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden .
2) Shared Embeddings: When using BPE with a shared vocabulary, we can share the same weight vectors between the source, target, and generator. See (cite) for details. To add this to the model, simply do this:
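The weight-tying snippet itself is not shown in this excerpt. A minimal sketch, assuming the notebook's `Embeddings` modules expose their lookup table as `.lut` and the `Generator` its output projection as `.proj`:

# Tie source embeddings, target embeddings, and the output projection.
# Attribute names (.lut, .proj) are assumptions based on the notebook's modules.
model.tgt_embed[0].lut.weight = model.src_embed[0].lut.weight
model.generator.proj.weight = model.tgt_embed[0].lut.weight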
3) Beam Search: This is a bit too complicated to cover here. See OpenNMT-py for a PyTorch implementation.

4) Model Averaging: The paper averages the last k checkpoints to create an ensembling effect. We can do this after the fact if we have a bunch of models:
def average(model, models):
    "Average the parameters of `models` into `model`."
    for ps in zip(*[m.parameters() for m in [model] + models]):
        ps[0].data.copy_(sum(p.data for p in ps[1:]) / len(ps[1:]))
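For example, with a list of saved checkpoints (the file names here are hypothetical):

# Hypothetical checkpoint files; average the last five into `model`.
models = [torch.load("iwslt_checkpoint_%d.pt" % i) for i in range(5)]
average(model, models)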
The code we have written here is a version of the base model. There are fully trained versions of this system available here (Example Models).
With the additional extensions in the last section, the OpenNMT-py replication gets to 26.9 on EN-DE WMT. Here I have loaded in those parameters to our reimplementation.
@inproceedings{opennmt,
  author    = {Guillaume Klein and Yoon Kim and Yuntian Deng and Jean Senellart and Alexander M. Rush},
  title     = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
  booktitle = {Proc. ACL},
  year      = {2017},
  url       = {https://doi.org/10.18653/v1/P17-4012},
  doi       = {10.18653/v1/P17-4012}
}