11、数据操作
<p>[TOC]</p>
<pre><code></code></pre>
<h1>read_one_line</h1>
<pre><code>import sys
import json
def preview_json_lines(path, max_lines=11):
    """Print the first ``max_lines`` records of a JSON-lines file.

    Every inspected line is parsed with ``json.loads`` and the resulting
    object is printed; afterwards the number of printed records is printed
    as well.

    Args:
        path: Path of a UTF-8 text file holding one JSON document per line.
        max_lines: Maximum number of records to parse and print. The default
            of 11 reproduces the historical ``if lidx > 10: break`` cut-off,
            which stopped only after the 11th record had been printed.

    Returns:
        The number of records that were read and printed (0 for an empty
        file).

    Raises:
        json.JSONDecodeError: If an inspected line is not valid JSON.
    """
    shown = 0
    with open(path, encoding='UTF-8') as fin:
        for line in fin:
            if shown >= max_lines:
                break
            print(json.loads(line.strip()))
            shown += 1
    # Counting explicitly (instead of reusing the loop index after the loop)
    # keeps this well-defined for an empty file and never reports a stale
    # value left over from a previously inspected file.
    print(shown)
    return shown


if __name__ == '__main__':
    # Eyeball the head of each training shard.
    for shard_path in (
        './zhidao.train_1.json',
        './zhidao.train_2.json',
        './zhidao.train_3.json',
    ):
        preview_json_lines(shard_path)
# print(len(tset_all))
# tset_all = []
# with open('./zhidao.train.json', encoding='UTF-8') as fin:
# with open('zhidao.train_4.json', 'w', encoding='UTF-8') as fout:
# i = 0
# for lidx, line in enumerate(fin, 1):
# # continue
# sample = json.loads(line.strip())
# if len(sample['answer_spans']) == 0:
# continue
# # 待定-----------------------------略去passage长度为500的
# if sample['answer_spans'][0][1] >= 500:
# continue
# i = i +1
# if lidx < 33842 * 3:
# continue
# fout.write(json.dumps(sample, ensure_ascii=False) + '\n')
#
# # if lidx > 33842 * 3:
# # break
# # tset_all.append(sample)
# # print(sample)
# # if lidx > 10:
# # break
# print(lidx)
# print(i)
# 135366
# print(len(tset_all))
# with open('./test.predicted.json', encoding='UTF-8') as fin:
# for lidx, line in enumerate(fin, 1):
# # continue
# sample = json.loads(line.strip())
# tset_all.append(sample)
# # print(sample)
# # if lidx > 10:
# # break
# print(lidx)
# print(len(tset_all))
# with open('test_result_0515.json', 'w',encoding='UTF-8') as fout:
# for pred_answer in tset_all:
# fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')</code></pre>
<h1>read_line</h1>
<pre><code>import sys
import json
def load_json_lines(path):
    """Parse a JSON-lines file into a list of Python objects.

    Args:
        path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one parsed object per line, in file order (empty for an
        empty file).

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
    """
    with open(path, encoding='UTF-8') as fin:
        return [json.loads(line.strip()) for line in fin]


def merge_json_lines(in_paths, out_path):
    """Concatenate several JSON-lines files into a single JSON-lines file.

    After each input file is loaded, the number of records it contributed
    and the running total are printed.

    Args:
        in_paths: Iterable of input file paths, merged in the given order.
        out_path: Path of the file to (over)write with the merged records.
            Records are serialised with ``ensure_ascii=False`` so non-ASCII
            text is written verbatim.

    Returns:
        The merged list of records.
    """
    merged = []
    for path in in_paths:
        records = load_json_lines(path)
        merged.extend(records)
        # len(records) is used rather than a leftover loop index so that an
        # empty input reports 0 instead of raising NameError or repeating
        # the previous file's count.
        print(len(records))
        print(len(merged))
    with open(out_path, 'w', encoding='UTF-8') as fout:
        fout.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in merged
        )
    return merged


if __name__ == '__main__':
    # Further prediction shards (e.g. test.predicted5/6.json) can simply be
    # appended to this list.
    tset_all = merge_json_lines(
        ['./test.predicted3.json', './test.predicted4.json'],
        'test_result_0519-1.json',
    )
# 输入
# >> cat qq.json | python prepare_file.py > qq1.json
# 读取 qq.json 作为输入,经过 prepare_file 中的操作后,所有 print 的输出作为 qq1.json 的内容
# with open('./zhidao.train.json', encoding='UTF-8') as fin:
# with open('zhidao.train_4.json', 'w', encoding='UTF-8') as fout:
# i = 0
# for lidx, line in enumerate(fin, 1):
# # continue
# sample = json.loads(line.strip())
# if len(sample['answer_spans']) == 0:
# continue
# # 待定-----------------------------略去passage长度为500的
# if sample['answer_spans'][0][1] >= 500:
# continue
# i = i +1
# if lidx < 33842 * 3:
# continue
# fout.write(json.dumps(sample, ensure_ascii=False) + '\n')
# print(lidx)
# print(i)
# print(len(tset_all))
</code></pre>
<h1>cut_file</h1>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache
def split_shuffled_lines(src_path, out_paths, chunk_size=33842):
    """Shuffle the lines of a text file and split them into disjoint shards.

    The source lines are read once, permuted with ``np.random.shuffle`` and
    written out in consecutive slices of ``chunk_size`` lines; the last
    output file receives whatever remains.

    Compared with a ``linecache.getline(path, index)`` loop driven by
    ``np.arange(N)``, this:

    * uses 0-based list indexing throughout (``linecache.getline`` is
      1-based, so index 0 yields an empty string and the final line is
      never selected),
    * slices with half-open ranges so neighbouring shards never share a
      sample,
    * takes the number of lines from the file itself rather than from a
      hard-coded constant.

    Args:
        src_path: Path of the UTF-8 source file (one sample per line).
        out_paths: Sequence of output file paths, one per shard. Each file
            is overwritten.
        chunk_size: Number of lines written to every shard except the last.

    Returns:
        A list with the number of lines written to each output file.
    """
    with open(src_path, encoding='UTF-8') as fin:
        lines = fin.readlines()
    # Guarantee every sample ends with a newline so that a shuffled final
    # line cannot be glued to the sample written after it.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    order = np.arange(len(lines))
    np.random.shuffle(order)

    counts = []
    last_part = len(out_paths) - 1
    for part, out_path in enumerate(out_paths):
        start = part * chunk_size
        stop = len(order) if part == last_part else start + chunk_size
        shard = [lines[idx] for idx in order[start:stop]]
        with open(out_path, 'w', encoding='UTF-8') as fout:
            fout.writelines(shard)
        counts.append(len(shard))
    return counts


if __name__ == '__main__':
    # Usage: python prepare_file.py
    # Shuffles zhidao.train.json and writes four disjoint shards.
    split_shuffled_lines(
        'zhidao.train.json',
        [
            'zhidao.train_1.json',
            'zhidao.train_2.json',
            'zhidao.train_3.json',
            'zhidao.train_4.json',
        ],
    )
# file = open('./search.train.json', encoding='UTF-8')
#
# linecount = len(file.readlines())
# print(linecount)
# # linecache.getline('3_2.txt', linecount)
# indices = np.arange(135364)
# np.random.shuffle(indices)
# # str = linecache.getlines('zhidao.train.json')
# # with open('zhidao.train_shuffle.json', 'w', encoding='UTF-8') as fout:
# for id, item in enumerate(indices):
# print(item)
# fout.write(json.dumps(linecache.getline('zhidao.train.json', item), ensure_ascii=False) + '\n')
# print(linecache.getline('zhidao.train.json', item))
# if id > 3:
# break
# 输入
# >> cat qq.json | python prepare_file.py > qq1.json
# 读取 qq.json 作为输入,经过 prepare_file 中的操作后,所有 print 的输出作为 qq1.json 的内容
# i = 0
# b = 0
# indices = np.arange(128124)
# np.random.shuffle(indices)
#
# with open('./train.json', encoding='UTF-8') as fin:
# data_set = []
# for line in fin:
#
# print(linecache.getline('url.txt', 2))
# sample = json.loads(line.strip())
# data_set.append(sample)
# for item,id in enumerate(indices):
# if item < 32031:
# print(json.dumps(data_set[id], ensure_ascii=False))
# f = h5py.File('train_set.h5', 'w')
# for lineid, line in enumerate(sys.stdin):
# sample = json.loads(line)
# f[str(lineid + 1)] = sample
# # line 是lineid 这一行的内容
# # item 是shuffle后的行号
# # for id, item in enumerate(indices):
# # if lineid == item :
# # sample = json.loads(line)
# # f[id] = sample
# # # print(json.dumps(sample, ensure_ascii=False))
# b = 0
# for id, item in enumerate(indices):
# print(b)
# if b < 32031:
# # print(id,': ',item)
# i = 0
# for lineid, line in enumerate(sys.stdin):
# # line 是lineid 这一行的内容
# # item 是shuffle后的行号
# if i == item:
# sample = json.loads(line)
# print(json.dumps(sample, ensure_ascii=False))
# i = i + 1
# b = b +1
# print(indices[1:100])
# for line in sys.stdin:
# b = b + 1
# # sample = json.loads(line)
# # if len(sample['answer_spans']) == 0:
# # continue
# # if sample['answer_spans'][0][1] >= 500:
# # continue
# # i=i+1
# # print(json.dumps(sample, ensure_ascii=False))
# print(b)
</code></pre>
<h1>add_file</h1>
<pre><code>import json
import re
import hashlib
# MD5 digests of every distinct question seen so far.
keys_set = set()
# Placeholder for de-duplicated samples; nothing is appended to it here.
sample_list = []


def count_unique_questions(paths, seen=None):
    """Count total and distinct questions across JSON-lines files.

    Each line is parsed as JSON and the MD5 hex digest of
    ``str(sample['question'])`` (UTF-8 encoded) is added to ``seen``.

    Args:
        paths: Iterable of UTF-8 JSON-lines file paths, read in order.
        seen: Optional set of digests to update in place, allowing counts
            to accumulate across calls. A fresh set is used when omitted.

    Returns:
        A ``(total_lines, unique_questions)`` tuple, where
        ``unique_questions`` is the size of ``seen`` after processing.

    Raises:
        KeyError: If a record has no ``'question'`` field.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    seen = set() if seen is None else seen
    total = 0
    for path in paths:
        with open(path, encoding='UTF-8') as fin:
            for line in fin:
                total += 1
                question = json.loads(line.strip())['question']
                digest = hashlib.md5(
                    str(question).encode(encoding='UTF-8')
                ).hexdigest()
                # Adding to a set is idempotent, so duplicates need no
                # explicit membership test.
                seen.add(digest)
    return total, len(seen)


if __name__ == '__main__':
    # Other sources (e.g. ./pre_YES_NO_1.json, ./pre_YES_NO_2.json) can be
    # added to this list to include them in the count.
    i, _ = count_unique_questions(
        ['./search.train.json', './zhidao.train.json'],
        keys_set,
    )
    print('i: ', i)
    print(len(keys_set))
# with open('./train.json', 'w',encoding='UTF-8') as fout:
# for pred_answer in sample_list:
# fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
</code></pre>