11、提取elmo、bert词向量
<pre><code>import os
import pickle
#import numpy as np
import h5py
import hashlib
import json
from bert_serving.client import BertClient
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
from elmoformanylangs import Embedder
# Extract one ELMo vector per vocabulary word and pickle the word list and
# the stacked embedding matrix side by side (row i belongs to word_list[i]).

# Simplified-Chinese ELMo model; the path is specific to the author's machine.
e = Embedder('F:\\github_qa\\ELMoForManyLangs\\zhs.model')

# Locations of the saved KeyedVectors files.
pyltp_flag_embed_path = './data/embedding/flag_embed100_pyltp.wv'
pyltp_token_embed_path = './data/embedding/token_embed300_pyltp.wv'
jieba_token_embed_path = './data/embedding/token_embed300_jieba.wv'

pyltp_token_wv = KeyedVectors.load(pyltp_token_embed_path)
# Not used by the extraction below; kept so the module still exposes it.
jieba_token_wv = KeyedVectors.load(jieba_token_embed_path)

# Copy the vocabulary so that appending the special tokens does not mutate
# the list owned by the loaded KeyedVectors object.
word_list = list(pyltp_token_wv.index2word)
word_list.append('<unk>')
word_list.append('<pad>')

emb_list = []
for idx, item in enumerate(word_list):
    # NOTE(review): the ELMoForManyLangs README shows sents2elmo taking a list
    # of tokenised sentences (list of lists of str). Here each word is passed
    # as a bare string, so it is presumably iterated character by character
    # and [0][0] keeps only the first position -- confirm this is intended.
    try:
        embedding_np = e.sents2elmo([item])[0][0]
    except Exception as err:
        # Best effort: report the failure and fall back to the '<unk>' vector.
        # The exception must not be bound to `e`: that would replace the
        # embedder with the exception object and, in Python 3, unbind the
        # name once the handler exits, breaking every later lookup.
        print(err)
        embedding_np = e.sents2elmo(['<unk>'])[0][0]
    emb_list.append(embedding_np)
    if idx % 1000 == 1:
        # Coarse progress indicator.
        print(idx)
    # Uncomment to stop early while debugging:
    # if idx > 100:
    #     break

emb_list_np = np.array(emb_list)
print(emb_list_np.shape)

# Context managers make sure both pickle files are flushed and closed.
with open('./elmo.pyltp.index2word.pkl', 'wb') as index_file:
    pickle.dump(word_list, index_file)
with open('./elmo.pyltp.embedding.pkl', 'wb') as embedding_file:
    pickle.dump(emb_list_np, embedding_file)
# sents = ['今']
# a = e.sents2elmo(sents)
# a[0][0]
## **************************************************************
#
# pyltp_flag_embed_path = './data/embedding/flag_embed100_pyltp.wv'
# pyltp_token_embed_path = './data/embedding/token_embed300_pyltp.wv'
#
# jieba_token_embed_path = './data/embedding/token_embed300_jieba.wv'
#
# pyltp_token_wv = KeyedVectors.load(pyltp_token_embed_path)
# jieba_token_wv = KeyedVectors.load(jieba_token_embed_path)
#
# word_list = pyltp_token_wv.index2word
# # word_list = jieba_token_wv.index2word
#
# print(word_list[1:5])
# bc = BertClient()
# emb_list = []
#
# word_list.append('<unk>')
# word_list.append('<pad>')
#
# # embedding_np = bc.encode(['<unk>'])
# # emb_list.append(embedding_np)
# # embedding_np = bc.encode(['<pad>'])
# # emb_list.append(embedding_np)
#
#
# for id, item in enumerate(word_list):
# try:
# embedding_np = bc.encode([item])[0]
# except Exception as e:
# print(e)
# embedding_np =bc.encode(['<unk>'])[0]
# emb_list.append(embedding_np)
# if id % 1000 == 1:
# print(id)
# # if id > 100:
# # break
# emb_list_np = np.array(emb_list)
# print(emb_list_np.shape)
#
# pickle.dump(word_list, open('./bert.pyltp.index2word.pkl', 'wb'))
# pickle.dump(emb_list_np, open('./bert.pyltp.embedding.pkl', 'wb'))
#
# # *******************************************************************
</code></pre>