5. Word Vectors
<pre><code class="language-python">"""
This is an implementation of Word2Vec using numpy. Uncomment the print functions to see Word2Vec in action! Also remember to change the number of epochs and set training_data to training_data[0] to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available here - https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing
Have fun learning!
Author: Derek Chia
Email: derek@derekchia.com
"""
# Chinese translation for reference: https://ai.yanxishe.com/page/TextTranslation/1317
import numpy as np
from collections import defaultdict
## Randomly initialise
# Randomly initialised parameters (hard-coded below)
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
[-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
[-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
[0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
[0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
[0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
[0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
[0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
[0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]
getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
[-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
[-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
[0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
[-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
[0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
[-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
[-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
[-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
[0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]
class word2vec():
    def __init__(self):
        self.n = settings['n']                  # 10
        self.lr = settings['learning_rate']     # 0.01
        self.epochs = settings['epochs']        # 50
        self.window = settings['window_size']   # 2
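        # Note: `settings` here is the module-level dict defined further down in this file,
        # so it must already exist when word2vec() is instantiated (it does - see below).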
    def generate_training_data(self, settings, corpus):
        # Find unique word counts using dictionary
        word_counts = defaultdict(int)
        # Iterate over every row (sentence) in the corpus and count word occurrences
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        ###########################################################################################################################
        # row: ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']
        # word_counts is an int-valued defaultdict holding the number of occurrences of each word
        # print(word_counts)
        # defaultdict(<class 'int'>, {'natural': 1, 'language': 1, 'processing': 1, 'and': 2, 'machine': 1, 'learning': 1, 'is': 1, 'fun': 1, 'exciting': 1})
        ###########################################################################################################################

        ## How many unique words in vocab? 9
        self.v_count = len(word_counts.keys())
        #########################
        # print(self.v_count)   #
        # 9                     #
        #########################

        # Generate Lookup Dictionaries (vocab)
        self.words_list = list(word_counts.keys())
        #################################################################################################
        # print(self.words_list)                                                                        #
        # ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'exciting']  #
        #################################################################################################

        # Generate word:index
        # enumerate() pairs each element of an iterable (list, tuple, string, ...) with its index
        # and is typically used in for loops; here i is the index and word the element,
        # and together they build the lookup dictionary.
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        #############################################################################################################################
        # print(self.word_index)                                                                                                    #
        # {'natural': 0, 'language': 1, 'processing': 2, 'and': 3, 'machine': 4, 'learning': 5, 'is': 6, 'fun': 7, 'exciting': 8}   #
        #############################################################################################################################

        # Generate index:word (same approach as above)
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        #############################################################################################################################
        # print(self.index_word)                                                                                                    #
        # {0: 'natural', 1: 'language', 2: 'processing', 3: 'and', 4: 'machine', 5: 'learning', 6: 'is', 7: 'fun', 8: 'exciting'}   #
        #############################################################################################################################

        training_data = []
        # Each sentence is one element of corpus, e.g.
        # ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']
        # Cycle through each sentence in corpus
        for sentence in corpus:
            sent_len = len(sentence)
            # Cycle through each word in sentence
            for i, word in enumerate(sentence):
                # Convert target word to one-hot,
                # e.g. sentence[0] 'natural' becomes [1, 0, 0, 0, 0, 0, 0, 0, 0]
                w_target = self.word2onehot(sentence[i])
                # Cycle through context window
                w_context = []
                # Note: window_size 2 will have range of 5 values
                # (2 words before and 2 after the target word, 5 positions including the target itself)
                # range(a, b) excludes b, hence the upper bound i + self.window + 1
                for j in range(i - self.window, i + self.window + 1):
                    # Criteria for context word
                    # 1. Target word cannot be context word (j != i)
                    # 2. Index must be greater or equal than 0 (j >= 0) - if not, list index out of range
                    # 3. Index must be less or equal than length of sentence (j <= sent_len-1) - if not, list index out of range
                    if j != i and j <= sent_len - 1 and j >= 0:
                        # Append the one-hot representation of the context word to w_context
                        w_context.append(self.word2onehot(sentence[j]))
                        # print(sentence[i], sentence[j])
                        #########################
                        # Example: natural      #
                        # natural language      #
                        # natural processing    #
                        # ******************    #
                        # Example: language     #
                        # language natural      #
                        # language processing   #
                        # language and          #
                        #########################
                # training_data contains a one-hot representation of the target word and context words
                #################################################################################################
                # Example:                                                                                      #
                # [Target] natural, [Context] language, [Context] processing                                    #
                # print(training_data)                                                                          #
                # [[[1, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]]]]   #
                #################################################################################################
                training_data.append([w_target, w_context])
        # Return the list of [target, context] pairs; dtype=object because each sample
        # holds a variable number of context words (a ragged array)
        return np.array(training_data, dtype=object)
    def word2onehot(self, word):
        # word_vec - initialise a blank (all-zero) vector
        word_vec = [0 for i in range(0, self.v_count)]  # Alternative - np.zeros(self.v_count)
        ################################
        # print(word_vec)              #
        # [0, 0, 0, 0, 0, 0, 0, 0, 0]  #
        ################################
        # Get ID of word from word_index
        word_index = self.word_index[word]
        # Change value from 0 to 1 according to ID of the word,
        # e.g. a word with ID 0 gives [1, 0, 0, 0, 0, 0, 0, 0, 0]
        word_vec[word_index] = 1
        return word_vec
    def train(self, training_data):
        # Initialising weight matrices
        # np.random.uniform(LOW, HIGH, OUTPUT_SHAPE)
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.random.uniform.html
        self.w1 = np.array(getW1)
        self.w2 = np.array(getW2)
        # self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        # self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        # Cycle through each epoch
        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0
            # Cycle through each training sample
            # w_t = vector for target word, w_c = vectors for context words
            for w_t, w_c in training_data:
                # Forward pass
                # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
                y_pred, h, u = self.forward_pass(w_t)
                #########################################
                # print("Vector for target word:", w_t) #
                # print("W1-before backprop", self.w1)  #
                # print("W2-before backprop", self.w2)  #
                #########################################

                # Calculate error
                # 1. For a target word, calculate difference between y_pred and each of the context words
                # 2. Sum up the differences using np.sum to give us the error for this particular target word
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                ######################
                # print("Error", EI) #
                ######################

                # Backpropagation
                # We use SGD to backpropagate errors - calculate loss on the output layer
                self.backprop(EI, h, w_t)
                #########################################
                # print("W1-after backprop", self.w1)   #
                # print("W2-after backprop", self.w2)   #
                #########################################

                # Calculate loss
                # There are 2 parts to the loss function
                # Part 1: -ve sum of all the output +
                # Part 2: length of context words * log of sum for all elements (exponential-ed) in the output layer before softmax (u)
                # Note: word.index(1) returns the index in the context word vector with value 1
                # Note: u[word.index(1)] returns the value of the output layer before softmax
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
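                # In formula form, the loss accumulated above for one training sample is:
                #   E = -sum_{c in context} u_{c*} + C * log(sum_j exp(u_j))
                # where u is the pre-softmax output layer, c* indexes the true context words,
                # and C = len(w_c) is the number of context words.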
                #############################################################
                # Break if you want to see weights after first target word  #
                # break                                                      #
                #############################################################
            print('Epoch:', i, "Loss:", self.loss)
    def forward_pass(self, x):
        # x is the one-hot vector for the target word, shape (9,)
        # Run x through the first matrix (w1) to get the hidden layer - 1x9 dot 9x10 gives the 1x10 vector h
        h = np.dot(x, self.w1)
        # Dot product of hidden layer with the second matrix (w2) - 1x10 dot 10x9 gives the 1x9 output layer u
        u = np.dot(h, self.w2)
        # Run the 1x9 vector u through softmax to force each element into the range [0, 1]
        y_c = self.softmax(u)
        return y_c, h, u
    # Example:
    # u
    # array([ 0.443687, -0.674272, 1.322973, -0.149951, 1.14293 , -2.073111,
    #        -0.635283, 0.516231, 1.21318 ])
    # softmax(u) normalises every element into [0, 1]:
    # array([0.10044958, 0.03284163, 0.24200101, 0.05547974, 0.20212754,
    #        0.00810805, 0.03414738, 0.10800742, 0.21683765])
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
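    # Subtracting np.max(x) before exponentiating leaves the softmax output unchanged
    # (softmax is invariant to shifting its inputs), but it keeps np.exp from overflowing on large values.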
    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # Column vector EI represents row-wise sum of prediction errors across each context word for the current center word
        # Going backwards, we need to take the derivative of E with respect to w2
        # h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
        # x - shape 9x1, w2 - 10x9, e.T - 9x1
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        ##########################################
        # print('Delta for w2', dl_dw2)          #
        # print('Hidden layer', h)               #
        # print('np.dot', np.dot(self.w2, e.T))  #
        # print('Delta for w1', dl_dw1)          #
        ##########################################
        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)
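    # In matrix form, the gradients applied above are:
    #   dL/dw2 = outer(h, e)        -> shape (10, 9), matching w2
    #   dL/dw1 = outer(x, w2 . e)   -> shape (9, 10), matching w1
    # followed by the SGD update  w <- w - lr * dL/dw.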
    # Get vector from word
    def word_vec(self, word):
        # w1 stores one vector per word in the vocabulary,
        # so indexing it with the word's ID returns that word's vector
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        # print(self.w1)
        return v_w
    # Input vector, returns nearest word(s)
    # top_n is how many similar words to output
    def vec_sim(self, word, top_n):
        # Vector for the query word
        v_w1 = self.word_vec(word)
        word_sim = {}
        # Loop over all v_count words in the vocabulary
        for i in range(self.v_count):
            # Find the similarity score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den
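            # theta is the cosine similarity between v_w1 and v_w2:
            #   cos(a, b) = (a . b) / (||a|| * ||b||)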
            word = self.index_word[i]
            word_sim[word] = theta
        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for word, sim in words_sorted[:top_n]:
            print(word, sim)
#####################################################################
# Hyperparameters
settings = {
    'window_size': 2,       # context window +- center word (the two nearest words on each side of the target are its context)
    'n': 10,                # dimensions of word embeddings, also refers to size of hidden layer (commonly 100-300)
    'epochs': 50,           # number of training epochs
    'learning_rate': 0.01   # learning rate
}

text = "natural language processing and machine learning is fun and exciting"
# Note the .lower() as upper and lowercase does not matter in our implementation
# Resulting list of words:
# [['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]
corpus = [[word.lower() for word in text.split()]]
# Initialise object
w2v = word2vec()

# Generate training data: convert the corpus to one-hot encodings that the Word2Vec model can train on
# Numpy ndarray with one-hot representation for [target_word, context_words]
training_data = w2v.generate_training_data(settings, corpus)
print(training_data)

# Training
w2v.train(training_data)
print('****************************')

# Get vector for a word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words: print the 3 most similar words
w2v.vec_sim("machine", 3)
</code></pre>
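After training, row i of w1 is the embedding of the word whose index is i, so the learned vectors can be pulled out into a plain dictionary. The snippet below is a minimal sketch that builds on the w2v object trained above, using only the word_index and w1 attributes defined in the code; the helper name get_embeddings is purely illustrative.
<pre><code class="language-python">import numpy as np

def get_embeddings(model):
    # Row i of model.w1 is the embedding of the word whose index is i,
    # so pairing word_index with w1 gives a {word: vector} dictionary.
    return {word: model.w1[idx] for word, idx in model.word_index.items()}

embeddings = get_embeddings(w2v)
for w, v in embeddings.items():
    print(w, np.round(v, 3))

# Cosine similarity between two learned vectors - the same measure vec_sim uses
a, b = embeddings['machine'], embeddings['learning']
print('cos(machine, learning) =', np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
</code></pre>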