读取大文件,并进行shuffle
<p>方法1:按行对大文件进行 shuffle(随机打乱),结果输出到标准输出,再重定向到目标文件</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache
def shuffled_lines(path):
    """Return every line of the text file at *path* in random order.

    The line count is taken from the file itself instead of being
    hard-coded, and line numbers start at 1 because
    ``linecache.getline`` is 1-based (asking for line 0 yields ``''``).

    Args:
        path: Path of the line-oriented text file to shuffle.

    Returns:
        A list holding each line of the file exactly once, in random
        order.  An unreadable or missing file yields an empty list,
        because ``linecache`` reports such files as having no lines.
    """
    # Drop any stale cache entry so the line count matches the file on disk.
    linecache.checkcache(path)
    total = len(linecache.getlines(path))

    # 1-based line numbers: 1 .. total inclusive.
    line_numbers = np.arange(1, total + 1)
    np.random.shuffle(line_numbers)
    return [linecache.getline(path, int(number)) for number in line_numbers]


if __name__ == '__main__':
    # Usage:
    #   python prepare_file.py > zhidao.train_shuffle.json
    #
    # Lines are written verbatim; print() would append a second newline
    # after each record and leave blank lines in the output.
    sys.stdout.writelines(shuffled_lines('zhidao.train.json'))
</code></pre>
<p>方法2:</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache
def write_shuffled(src_path, dst_path):
    """Write the lines of *src_path* to *dst_path* in random order.

    The line count is read from the source file, and line numbers start
    at 1 because ``linecache.getline`` is 1-based (line 0 yields ``''``,
    which would silently drop one record and never emit the last one).

    Args:
        src_path: Line-oriented text file to shuffle.
        dst_path: Output file; created or truncated, written as UTF-8.

    Returns:
        The number of lines written.
    """
    # Drop any stale cache entry so the line count matches the file on disk.
    linecache.checkcache(src_path)
    total = len(linecache.getlines(src_path))

    # 1-based line numbers: 1 .. total inclusive.
    line_numbers = np.arange(1, total + 1)
    np.random.shuffle(line_numbers)

    with open(dst_path, 'w', encoding='UTF-8') as fout:
        fout.writelines(
            linecache.getline(src_path, int(number))
            for number in line_numbers
        )
    return total


if __name__ == '__main__':
    write_shuffled('zhidao.train.json', 'zhidao.train_shuffle1.json')
</code></pre>
<p>实例:</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache
def write_shuffled_shards(src_path, dst_template, num_shards):
    """Shuffle the lines of *src_path* and split them into disjoint shards.

    Every source line lands in exactly one shard.  Line numbers start at
    1 because ``linecache.getline`` is 1-based, and the shards are cut
    with ``np.array_split`` so that neighbouring shards never share a
    record and their sizes differ by at most one.

    Args:
        src_path: Line-oriented text file to shuffle and split.
        dst_template: ``str.format`` template for the shard paths; it
            receives the 1-based shard number, e.g.
            ``'zhidao.train_{}.json'``.
        num_shards: Number of output files; must be at least 1.

    Returns:
        A list with the number of lines written to each shard, in shard
        order.
    """
    # Drop any stale cache entry so the line count matches the file on disk.
    linecache.checkcache(src_path)
    total = len(linecache.getlines(src_path))

    # 1-based line numbers: 1 .. total inclusive.
    line_numbers = np.arange(1, total + 1)
    np.random.shuffle(line_numbers)

    shard_sizes = []
    shards = np.array_split(line_numbers, num_shards)
    for shard_no, shard in enumerate(shards, 1):
        with open(dst_template.format(shard_no), 'w',
                  encoding='UTF-8') as fout:
            fout.writelines(
                linecache.getline(src_path, int(number)) for number in shard
            )
        shard_sizes.append(len(shard))
    return shard_sizes


if __name__ == '__main__':
    # Produces zhidao.train_1.json .. zhidao.train_4.json.
    write_shuffled_shards('zhidao.train.json', 'zhidao.train_{}.json', 4)
</code></pre>
<p>切割大文件为若干小文件(下面的示例只截取文件开头的若干行,写入一个小文件)</p>
<pre><code>import sys
import json
def copy_head_samples(src_path, dst_path, max_lines):
    """Copy the first *max_lines* JSON records of a JSON-lines file.

    Each record is parsed and re-serialised with ``ensure_ascii=False``
    so non-ASCII text is written as-is rather than as ``\\uXXXX``
    escapes.  Blank lines are skipped instead of being handed to
    ``json.loads``.

    Args:
        src_path: UTF-8 JSON-lines file to read.
        dst_path: Output file; created or truncated, written as UTF-8.
        max_lines: Maximum number of records to copy.

    Returns:
        The list of parsed records that were written, in file order
        (empty when the source has no records).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    samples = []
    with open(src_path, encoding='UTF-8') as fin, \
            open(dst_path, 'w', encoding='UTF-8') as fout:
        for line in fin:
            if len(samples) >= max_lines:
                break
            text = line.strip()
            if not text:
                continue
            sample = json.loads(text)
            fout.write(json.dumps(sample, ensure_ascii=False) + '\n')
            samples.append(sample)
    return samples


if __name__ == '__main__':
    # Copies the first eleven records.
    # NOTE(review): confirm whether ten was the intended sample size.
    copied = copy_head_samples(
        './zhidao.train_shuffle.json', 'zhidao.train_4.json', 11)
    for sample in copied:
        print(sample)
    # Number of records copied; 0 for an empty input file.
    print(len(copied))
</code></pre>