Introduction

ELMo is a model that addresses the fact that static word vectors cannot handle polysemy (one word with multiple senses).

While walking through how to implement the ELMo model, this post takes a detour into the Conv1d layer (one-dimensional convolution).

The code in this post follows plm-nlp-code chp6, which you can refer to directly.

Conv1d

Introduction

Convolutions come as Conv1d (sequences), Conv2d (images), and Conv3d (volumetric data); the main difference is the directions along which the kernel slides. Since text is a one-dimensional structure, NLP uses Conv1d.

One-dimensional convolution works well when the exact position of features within a sentence is not important, e.g. moving a keyword to a different spot usually does not change the sentence's meaning.
For genuinely temporal structure, however, it often performs poorly, because time series generally do not satisfy the translation-invariance assumption.

I won't go into the details of how Conv1d works here; if you're interested, see the TensorFlow 2 Conv1D documentation and the usage of PyTorch's nn.Conv1d.

Here we only care about the input and output shapes, the kernel size, padding, and so on.

Let o be the output length of the previous layer along the dimension being convolved (for text, the sequence length), kernel_size the kernel size, padding the amount of padding added on each side, and stride the step size. Then:
length after convolution: (o - kernel_size + 2 * padding) / stride + 1 (rounded down)
length after pooling: (o - kernel_size) / stride + 1 (rounded down)

Examples

In PyTorch the input to Conv1d has shape (batch_size, hidden_size, seq_length), where hidden_size plays the role of in_channels.

  • stride 1, no padding

    m = nn.Conv1d(16, 33, 3, stride=1)
    input = torch.randn(20, 16, 50)
    output = m(input)
    print(output.shape)

    torch.Size([20, 33, 48])

    (50 - 3) / 1 + 1 = 48

  • stride 2, no padding

    m = nn.Conv1d(16, 33, 3, stride=2)
    input = torch.randn(20, 16, 50)
    output = m(input)
    print(output.shape)

    torch.Size([20, 33, 24])

    (50 - 3) / 2 + 1 = 24 (rounded down)

  • stride 1, with padding

    m = nn.Conv1d(16, 33, 3, stride=1, padding=1)
    input = torch.randn(20, 16, 50)
    output = m(input)
    print(output.shape)

    torch.Size([20, 33, 50])

    (50 - 3 + 2 * 1) / 1 + 1 = 50

  • stride 2, with padding

    m = nn.Conv1d(16, 33, 3, stride=2, padding=1)
    input = torch.randn(20, 16, 50)
    output = m(input)
    print(output.shape)

    torch.Size([20, 33, 25])

    (50 - 3 + 2 * 1) / 2 + 1 = 25 (rounded down)
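
The pooling formula from above can be checked the same way. A quick sketch with nn.MaxPool1d (kernel size 3 and stride 2 are chosen arbitrarily here):

m = nn.MaxPool1d(3, stride=2)
input = torch.randn(20, 16, 50)
output = m(input)
print(output.shape)

torch.Size([20, 16, 24])

(50 - 3) / 2 + 1 = 24 (rounded down); pooling leaves the channel dimension (16) unchanged.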

The ELMo workflow

Dataset

Building the corpus

This part has two main steps:

  1. Build word-level and character-level vocabularies.
  2. Use those vocabularies to convert each sentence into word-level and character-level id sequences.
def load_corpus(path, max_tok_len=None, max_seq_len=None):
    # Read the raw text file and build vocabularies for both words and chars
    text = []
    charset = {BOS_TOKEN, EOS_TOKEN, PAD_TOKEN, BOW_TOKEN, EOW_TOKEN}
    print(f"Loading corpus from {path}")
    with codecs.open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            tokens = line.rstrip().split(" ")
            if max_seq_len is not None and len(tokens) + 2 > max_seq_len:
                # truncate overly long sentences (the original sliced `line`, a bug)
                tokens = tokens[:max_seq_len-2]
            # wrap every sentence with <bos> and <eos>
            sent = [BOS_TOKEN]
            for token in tokens:
                if max_tok_len is not None and len(token) + 2 > max_tok_len:
                    token = token[:max_tok_len-2]
                sent.append(token)
                # collect every character seen in the corpus
                for ch in token:
                    charset.add(ch)
            sent.append(EOS_TOKEN)
            text.append(sent)

    # Build the word-level and char-level vocabularies
    print("Building word-level vocabulary")
    vocab_w = Vocab.build(
        text,
        min_freq=2,
        reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]
    )
    print("Building char-level vocabulary")
    vocab_c = Vocab(tokens=list(charset))

    # Construct the corpus using word_vocab and char_vocab
    corpus_w = [vocab_w.convert_tokens_to_ids(sent) for sent in text]
    corpus_c = []
    bow = vocab_c[BOW_TOKEN]
    eow = vocab_c[EOW_TOKEN]
    for i, sent in enumerate(text):
        sent_c = []
        for token in sent:
            # split each word-level token into characters,
            # wrapping it with <bow> and <eow>
            if token == BOS_TOKEN or token == EOS_TOKEN:
                token_c = [bow, vocab_c[token], eow]
            else:
                token_c = [bow] + vocab_c.convert_tokens_to_ids(token) + [eow]
            sent_c.append(token_c)
        assert len(sent_c) == len(corpus_w[i])
        corpus_c.append(sent_c)

    assert len(corpus_w) == len(corpus_c)
    # word-level corpus, char-level corpus, and the two vocabularies
    return corpus_w, corpus_c, vocab_w, vocab_c
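
A minimal usage sketch; the file path and the length limits below are placeholders, not the actual chp6 settings:

corpus_w, corpus_c, vocab_w, vocab_c = load_corpus("train.txt", max_tok_len=20, max_seq_len=128)
print(len(vocab_w), len(vocab_c))  # vocabulary sizes
print(corpus_w[0])                 # word ids of the first sentence, wrapped in <bos>/<eos>
print(corpus_c[0][0])              # char ids of its first token, wrapped in <bow>/<eow>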

Building the training dataset

class BiLMDataset(Dataset):
    def __init__(self, corpus_w, corpus_c, vocab_w, vocab_c):
        super(BiLMDataset, self).__init__()
        self.pad_w = vocab_w[PAD_TOKEN]
        self.pad_c = vocab_c[PAD_TOKEN]

        self.data = []
        for sent_w, sent_c in tqdm(zip(corpus_w, corpus_c)):
            self.data.append((sent_w, sent_c))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # seq_lens: (batch_size,) true sentence lengths
        seq_lens = torch.LongTensor([len(ex[0]) for ex in examples])

        # inputs_w: padded word ids, batch_size x max_seq_len
        inputs_w = [torch.tensor(ex[0]) for ex in examples]
        inputs_w = pad_sequence(inputs_w, batch_first=True, padding_value=self.pad_w)

        # inputs_c: batch_size x max_seq_len x max_tok_len
        batch_size, max_seq_len = inputs_w.shape
        max_tok_len = max([max([len(tok) for tok in ex[1]]) for ex in examples])

        inputs_c = torch.LongTensor(batch_size, max_seq_len, max_tok_len).fill_(self.pad_c)
        for i, (sent_w, sent_c) in enumerate(examples):
            for j, tok in enumerate(sent_c):
                inputs_c[i][j][:len(tok)] = torch.LongTensor(tok)

        # targets_fw: the next word at each position; targets_bw: the previous word
        targets_fw = torch.LongTensor(inputs_w.shape).fill_(self.pad_w)
        targets_bw = torch.LongTensor(inputs_w.shape).fill_(self.pad_w)
        for i, (sent_w, sent_c) in enumerate(examples):
            targets_fw[i][:len(sent_w)-1] = torch.LongTensor(sent_w[1:])
            targets_bw[i][1:len(sent_w)] = torch.LongTensor(sent_w[:len(sent_w)-1])

        return inputs_w, inputs_c, seq_lens, targets_fw, targets_bw
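
For reference, a sketch of how this dataset is typically wired into a DataLoader (the batch size is a placeholder); note that collate_fn comes from the dataset itself:

from torch.utils.data import DataLoader

dataset = BiLMDataset(corpus_w, corpus_c, vocab_w, vocab_c)
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=dataset.collate_fn)

inputs_w, inputs_c, seq_lens, targets_fw, targets_bw = next(iter(loader))
print(inputs_w.shape)    # (batch_size, max_seq_len)
print(inputs_c.shape)    # (batch_size, max_seq_len, max_tok_len)
print(targets_fw.shape)  # same shape as inputs_w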

Model structure

Printing the BiLM model defined in chp6 gives the following:

BiLM(
  (token_embedder): ConvTokenEmbedder(
    (char_embeddings): Embedding(2147, 50, padding_idx=1846)
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
    )
    (highways): Highway(
      (layers): ModuleList(
        (0): Linear(in_features=1024, out_features=2048, bias=True)
        (1): Linear(in_features=1024, out_features=2048, bias=True)
      )
    )
    (projection): Linear(in_features=1024, out_features=512, bias=True)
  )
  (encoder): ELMoLstmEncoder(
    (forward_layers): ModuleList(
      (0): LSTM(512, 4096, batch_first=True)
      (1): LSTM(512, 4096, batch_first=True)
    )
    (backward_layers): ModuleList(
      (0): LSTM(512, 4096, batch_first=True)
      (1): LSTM(512, 4096, batch_first=True)
    )
    (forward_projections): ModuleList(
      (0): Linear(in_features=4096, out_features=512, bias=True)
      (1): Linear(in_features=4096, out_features=512, bias=True)
    )
    (backward_projections): ModuleList(
      (0): Linear(in_features=4096, out_features=512, bias=True)
      (1): Linear(in_features=4096, out_features=512, bias=True)
    )
  )
  (classifier): Linear(in_features=512, out_features=1479, bias=True)
)

This structure is essentially the same as the one in ELMoForManyLangs (dumped below), except that ELMoForManyLangs does not use PyTorch's built-in nn.LSTM: its encoder implements the projected LSTM cells itself (LstmCellWithProjection), and its token embedder additionally keeps a word-level embedding alongside the character-level one.

Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(71222, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(6169, 50, padding_idx=6166)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_features=4096, bias=True)
      )
    )
    (projection): Linear(in_features=2148, out_features=512, bias=True)
  )
  (encoder): ElmobiLm(
    (forward_layer_0): LstmCellWithProjection(
      (input_linearity): Linear(in_features=512, out_features=16384, bias=False)
      (state_linearity): Linear(in_features=512, out_features=16384, bias=True)
      (state_projection): Linear(in_features=4096, out_features=512, bias=False)
    )
    (backward_layer_0): LstmCellWithProjection(
      (input_linearity): Linear(in_features=512, out_features=16384, bias=False)
      (state_linearity): Linear(in_features=512, out_features=16384, bias=True)
      (state_projection): Linear(in_features=4096, out_features=512, bias=False)
    )
    (forward_layer_1): LstmCellWithProjection(
      (input_linearity): Linear(in_features=512, out_features=16384, bias=False)
      (state_linearity): Linear(in_features=512, out_features=16384, bias=True)
      (state_projection): Linear(in_features=4096, out_features=512, bias=False)
    )
    (backward_layer_1): LstmCellWithProjection(
      (input_linearity): Linear(in_features=512, out_features=16384, bias=False)
      (state_linearity): Linear(in_features=512, out_features=16384, bias=True)
      (state_projection): Linear(in_features=4096, out_features=512, bias=False)
    )
  )
)

Highway

For a proper explanation, see a Highway network write-up. Intuitively it feels similar to the gating mechanism in an LSTM: a gate controls how much of the input to carry through unchanged and how much to transform (remember vs. forget).
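
To make the gating idea concrete, here is a minimal sketch of a highway layer with the interface used by ConvTokenEmbedder below, i.e. Highway(input_dim, num_layers, activation). It matches the Linear(1024, 2048) shapes in the BiLM dump above, but it is only an illustration; the exact chp6 implementation (e.g. which term the gate multiplies) may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Highway(nn.Module):
    def __init__(self, input_dim, num_layers, activation=F.relu):
        super().__init__()
        self.input_dim = input_dim
        self.activation = activation
        # each layer projects to 2 * input_dim: one half becomes the transformed
        # output, the other half becomes the gate
        self.layers = nn.ModuleList(
            [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
        )

    def forward(self, inputs):
        curr = inputs
        for layer in self.layers:
            projected = layer(curr)
            hidden = self.activation(projected[..., :self.input_dim])
            gate = torch.sigmoid(projected[..., self.input_dim:])
            # the gate mixes the transformed output with the untouched input,
            # much like an LSTM gate (some implementations swap the two terms)
            curr = gate * hidden + (1 - gate) * curr
        return curr

hw = Highway(1024, 2, activation=F.relu)
print(hw(torch.randn(8, 1024)).shape)  # torch.Size([8, 1024])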

ConvTokenEmbedder

class ConvTokenEmbedder(nn.Module):
    def __init__(
        self,
        vocab_c,
        char_embedding_dim,
        char_conv_filters,
        num_highways,
        output_dim,
        pad="<pad>"
    ):
        super(ConvTokenEmbedder, self).__init__()
        self.vocab_c = vocab_c

        self.char_embeddings = nn.Embedding(
            len(vocab_c),
            char_embedding_dim,
            padding_idx=vocab_c[pad]
        )
        self.char_embeddings.weight.data.uniform_(-0.25, 0.25)

        # one Conv1d per (kernel_size, out_channels) pair
        self.convolutions = nn.ModuleList()
        for kernel_size, out_channels in char_conv_filters:
            conv = torch.nn.Conv1d(
                in_channels=char_embedding_dim,
                out_channels=out_channels,
                kernel_size=kernel_size,
                bias=True
            )
            self.convolutions.append(conv)

        self.num_filters = sum(f[1] for f in char_conv_filters)
        self.num_highways = num_highways
        self.highways = Highway(self.num_filters, self.num_highways, activation=F.relu)

        self.projection = nn.Linear(self.num_filters, output_dim, bias=True)

    def forward(self, inputs):
        batch_size, seq_len, token_len = inputs.shape
        # flatten (batch_size, seq_len) so each token is convolved independently
        inputs = inputs.view(batch_size * seq_len, -1)
        char_embeds = self.char_embeddings(inputs)
        # Conv1d expects (N, channels, length), with channels = char_embedding_dim
        char_embeds = char_embeds.transpose(1, 2)

        conv_hiddens = []
        for i in range(len(self.convolutions)):
            conv_hidden = self.convolutions[i](char_embeds)
            # max-pool over character positions: one vector per token per filter set
            conv_hidden, _ = torch.max(conv_hidden, dim=-1)
            conv_hidden = F.relu(conv_hidden)
            conv_hiddens.append(conv_hidden)

        token_embeds = torch.cat(conv_hiddens, dim=-1)
        token_embeds = self.highways(token_embeds)
        token_embeds = self.projection(token_embeds)
        # reshape back to (batch_size, seq_len, output_dim)
        token_embeds = token_embeds.view(batch_size, seq_len, -1)

        return token_embeds

The interesting part here is the use of several different kernel_size values to capture context (character n-grams) within each token.
The key question is what the concatenation means: after max-pooling, the outputs of the different kernel sizes are concatenated into a single fixed-size feature vector per token, which could be projected to any dimension (even down to 2, say, for a classification task).
Here a Linear layer projects it to output_dim, and a final view reshapes the flat (batch_size * seq_len) axis back into (batch_size, seq_len), recovering a hidden vector for every token in the sequence. A slick trick; a quick shape check follows below.
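
A sketch of the shape flow: char_conv_filters matches the kernel sizes and channel counts visible in the BiLM dump above, vocab_c is assumed to be the char vocabulary returned by load_corpus earlier, and the batch dimensions are made up (max_tok_len just has to be at least the largest kernel size, 6 here).

char_conv_filters = [(1, 32), (2, 32), (3, 64), (4, 128), (5, 256), (6, 512)]
embedder = ConvTokenEmbedder(
    vocab_c=vocab_c,
    char_embedding_dim=50,
    char_conv_filters=char_conv_filters,
    num_highways=2,
    output_dim=512,
)
inputs_c = torch.randint(0, len(vocab_c), (4, 10, 8))  # (batch_size, seq_len, tok_len)
token_embeds = embedder(inputs_c)
print(token_embeds.shape)  # torch.Size([4, 10, 512]): 32+32+64+128+256+512 = 1024, projected to 512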

Training

Too lazy to go through it for now, so I'll skip the full training loop; a rough sketch of a single step is shown below.
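
A rough sketch of one training step, not the actual chp6 loop: the bidirectional LM loss is computed against targets_fw and targets_bw from the collate_fn above. The BiLM forward signature used here is an assumption (taking char-level inputs and sequence lengths and returning per-position logits over the word vocabulary for both directions); model, loader, and the hyperparameters are placeholders. Check chp6 for the exact code.

criterion = nn.CrossEntropyLoss(ignore_index=vocab_w[PAD_TOKEN])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # placeholder hyperparameters

model.train()
for inputs_w, inputs_c, seq_lens, targets_fw, targets_bw in loader:
    optimizer.zero_grad()
    outputs_fw, outputs_bw = model(inputs_c, seq_lens)  # assumed signature
    loss_fw = criterion(outputs_fw.reshape(-1, outputs_fw.size(-1)), targets_fw.reshape(-1))
    loss_bw = criterion(outputs_bw.reshape(-1, outputs_bw.size(-1)), targets_bw.reshape(-1))
    loss = (loss_fw + loss_bw) / 2
    loss.backward()
    optimizer.step()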

In short: convolutions with different kernel_size values capture context within each token, and note this happens at the char level. The result then goes through a forward LSTM and a backward LSTM that capture semantics in each direction. Viewed as a whole, the meaning of the different layers becomes fairly clear: the token embedding leans toward lexical information, while later layers lean toward deeper information such as semantics. In practice you can either pick whichever layer suits your task, or assign different weights to different layers, as sketched below.
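
A minimal sketch of weighting the layers, in the spirit of ELMo's scalar mix; the layer outputs here are random placeholders, with shapes following the BiLM above (each layer produces a (batch_size, seq_len, hidden) tensor).

import torch
import torch.nn as nn

class ScalarMix(nn.Module):
    def __init__(self, num_layers):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_layers))  # softmax-normalized per layer
        self.gamma = nn.Parameter(torch.ones(1))               # global scale

    def forward(self, layer_outputs):
        # layer_outputs: list of (batch_size, seq_len, hidden) tensors,
        # e.g. [token embeddings, LSTM layer 1, LSTM layer 2]
        w = torch.softmax(self.weights, dim=0)
        return self.gamma * sum(wi * h for wi, h in zip(w, layer_outputs))

mix = ScalarMix(num_layers=3)
layers = [torch.randn(4, 10, 512) for _ in range(3)]
print(mix(layers).shape)  # torch.Size([4, 10, 512])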

Summary

After reading through all of this, it still doesn't feel perfectly clear, possibly because of the many shape transformations along the way. The way ELMo resolves polysemy seems to come mainly from the LSTM-based contextualization together with modeling words from char-level tokens.

Also, different ELMo layers carry different kinds of information; SIFRank uses this in its own way, which is worth looking into later.