import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import vocab
from torchtext.transforms import VocabTransform
from collections import Counter, OrderedDict
def get_text():
    sentence_list = [
        "nlp drives computer programs that translate text from one language to another",
        "nlp combines computational linguistics rule based modeling of human language with statistical",
        "nlp model respond to text or voice data and respond with text",
    ]
    return sentence_list
class CbowDataSet(Dataset):
    def __init__(self, text_list, side_window=3):
        """
        Dataset that builds CBOW training samples for Word2vec.
        :param text_list: corpus, a list of sentences
        :param side_window: number of context words taken on each side of the
            center word; the total context size is 2 * side_window
        """
        super(CbowDataSet, self).__init__()
        self.side_window = side_window
        text_vocab, vocab_transform = self.reform_vocab(text_list)
        self.text_list = text_list
        self.text_vocab = text_vocab
        self.vocab_transform = vocab_transform
        self.cbow_data = self.generate_cbow()
    def __len__(self):
        return len(self.cbow_data)
    def __getitem__(self, idx):
        data_row = self.cbow_data[idx]
        return data_row[0], data_row[1]  # (center word id, context word ids)
    def reform_vocab(self, text_list):
        """Build a torchtext vocab from the corpus."""
        total_word_list = []
        for sentence in text_list:
            total_word_list += sentence.split(" ")
        counter = Counter(total_word_list)
        sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)
        special_token = ["<UNK>", "<SEP>"]
        text_vocab = vocab(ordered_dict, specials=special_token)
        text_vocab.set_default_index(0)  # out-of-vocabulary words map to <UNK>
        vocab_transform = VocabTransform(text_vocab)
        return text_vocab, vocab_transform
    def generate_cbow(self):
        """Generate the CBOW training pairs."""
        cbow_data = []
        for sentence in self.text_list:
            sentence_id_list = np.array(self.vocab_transform(sentence.split(' ')))
            # Slide a full window over the sentence; positions too close to
            # either edge cannot form a complete context and are skipped.
            for center_index in range(self.side_window, len(sentence_id_list) - self.side_window):
                pos_index = list(range(center_index - self.side_window, center_index + self.side_window + 1))
                del pos_index[self.side_window]  # drop the center word itself
                cbow_data.append([sentence_id_list[center_index], sentence_id_list[pos_index]])
        return cbow_data
    def get_vocab_transform(self):
        return self.vocab_transform
    def get_vocab_size(self):
        return len(self.text_vocab)
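As a quick sanity check, the dataset can be exercised directly on the toy corpus above. This snippet is a minimal sketch, not part of the original listing; the variable names are illustrative:

# Hypothetical usage sketch: inspect one CBOW sample from the toy corpus.
dataset = CbowDataSet(get_text(), side_window=3)
print(len(dataset))             # number of (center, context) pairs
center, context = dataset[0]
print(center)                   # id of the center word
print(context)                  # array of 2 * side_window context word ids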
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, batch_size, word_embedding_size=100, hidden=64):
        """
        CBOW implementation of the Word2vec model.
        :param vocab_size: number of words in the vocabulary
        :param batch_size: batch size (stored for reference)
        :param word_embedding_size: dimension of each word vector
        :param hidden: dimension of the hidden layer
        """
        super(Word2VecModel, self).__init__()
        self.vocab_size = vocab_size
        self.word_embedding_size = word_embedding_size
        self.hidden = hidden
        self.batch_size = batch_size
        self.word_embedding = nn.Embedding(self.vocab_size, self.word_embedding_size)
        self.linear_in = nn.Linear(self.word_embedding_size, self.hidden)
        self.linear_out = nn.Linear(self.hidden, self.vocab_size)
    def forward(self, input_labels):
        around_embedding = self.word_embedding(input_labels)
        # Average the context-word embeddings, then predict the center word.
        avg_around_embedding = torch.mean(around_embedding, dim=1)
        in_emb = F.relu(self.linear_in(avg_around_embedding))
        out_emb = F.log_softmax(self.linear_out(in_emb), dim=-1)  # dim must be given explicitly
        return out_emb
    def get_embedding(self, token_list: list):
        return self.word_embedding(torch.tensor(token_list, dtype=torch.long))
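Since forward() returns log-probabilities, training pairs naturally with nn.NLLLoss. The following loop is a minimal sketch under assumed hyperparameters (the batch size, learning rate, and epoch count are illustrative, not from the original):

# Minimal training sketch (assumed hyperparameters, not from the original).
BATCH_SIZE = 4
dataset = CbowDataSet(get_text())
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
model = Word2VecModel(dataset.get_vocab_size(), BATCH_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.NLLLoss()  # forward() already applies log_softmax

for epoch in range(100):
    for center, context in data_loader:
        optimizer.zero_grad()
        log_probs = model(context.long())       # predict center word from context
        loss = loss_fn(log_probs, center.long())
        loss.backward()
        optimizer.step()

After training, the learned vector for a word can be looked up by combining the two helpers, e.g. model.get_embedding(dataset.get_vocab_transform()(["nlp"])).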