python


11、提取 ELMo、BERT 词向量

<pre><code>import os import pickle #import numpy as np import h5py import hashlib import json from bert_serving.client import BertClient from gensim.models import Word2Vec, KeyedVectors import numpy as np from elmoformanylangs import Embedder e = Embedder('F:\\github_qa\\ELMoForManyLangs\\zhs.model') pyltp_flag_embed_path = './data/embedding/flag_embed100_pyltp.wv' pyltp_token_embed_path = './data/embedding/token_embed300_pyltp.wv' jieba_token_embed_path = './data/embedding/token_embed300_jieba.wv' pyltp_token_wv = KeyedVectors.load(pyltp_token_embed_path) jieba_token_wv = KeyedVectors.load(jieba_token_embed_path) word_list = pyltp_token_wv.index2word word_list.append('&lt;unk&gt;') word_list.append('&lt;pad&gt;') emb_list = [] for id, item in enumerate(word_list): try: embedding_np = e.sents2elmo([item])[0][0] except Exception as e: print(e) embedding_np = e.sents2elmo(['&lt;unk&gt;'])[0][0] emb_list.append(embedding_np) if id % 1000 == 1: print(id) # if id &gt; 100: # break emb_list_np = np.array(emb_list) print(emb_list_np.shape) pickle.dump(word_list, open('./elmo.pyltp.index2word.pkl', 'wb')) pickle.dump(emb_list_np, open('./elmo.pyltp.embedding.pkl', 'wb')) # sents = ['今'] # a = e.sents2elmo(sents) # a[0][0] ## ************************************************************** # # pyltp_flag_embed_path = './data/embedding/flag_embed100_pyltp.wv' # pyltp_token_embed_path = './data/embedding/token_embed300_pyltp.wv' # # jieba_token_embed_path = './data/embedding/token_embed300_jieba.wv' # # pyltp_token_wv = KeyedVectors.load(pyltp_token_embed_path) # jieba_token_wv = KeyedVectors.load(jieba_token_embed_path) # # word_list = pyltp_token_wv.index2word # # word_list = jieba_token_wv.index2word # # print(word_list[1:5]) # bc = BertClient() # emb_list = [] # # word_list.append('&lt;unk&gt;') # word_list.append('&lt;pad&gt;') # # # embedding_np = bc.encode(['&lt;unk&gt;']) # # emb_list.append(embedding_np) # # embedding_np = bc.encode(['&lt;pad&gt;']) # # 
emb_list.append(embedding_np) # # # for id, item in enumerate(word_list): # try: # embedding_np = bc.encode([item])[0] # except Exception as e: # print(e) # embedding_np =bc.encode(['&lt;unk&gt;'])[0] # emb_list.append(embedding_np) # if id % 1000 == 1: # print(id) # # if id &gt; 100: # # break # emb_list_np = np.array(emb_list) # print(emb_list_np.shape) # # pickle.dump(word_list, open('./bert.pyltp.index2word.pkl', 'wb')) # pickle.dump(emb_list_np, open('./bert.pyltp.embedding.pkl', 'wb')) # # # ******************************************************************* </code></pre>

页面列表

ITEM_HTML