python


读取大文件,并进行shuffle

<p>shuffle大文件</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache

if __name__ == '__main__':
    # Usage (redirect stdout to the target file):
    # &gt;&gt; python prepare_file.py &gt; zhidao.train_*.json

    # 135364 is presumably the number of lines in zhidao.train.json -- confirm.
    # linecache line numbers are 1-based, so shuffle 1..N rather than 0..N-1:
    # getline() returns '' for line 0, and line N would never be emitted.
    indices = np.arange(1, 135364 + 1)
    np.random.shuffle(indices)
    for lineno in indices:
        # getline() keeps the trailing newline, so suppress print's own one;
        # otherwise every record is followed by a blank line.
        print(linecache.getline('zhidao.train.json', lineno), end='')
</code></pre>
<p>方法2:</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache

if __name__ == '__main__':
    # 135364 is presumably the number of lines in zhidao.train.json -- confirm.
    # linecache line numbers are 1-based, so shuffle 1..N rather than 0..N-1.
    indices = np.arange(1, 135364 + 1)
    np.random.shuffle(indices)
    with open('zhidao.train_shuffle1.json', 'w', encoding='UTF-8') as fout:
        for lineno in indices:
            # Copy one randomly chosen line of zhidao.train.json to the output.
            fout.write(linecache.getline('zhidao.train.json', lineno))
</code></pre>
<p>实例:</p>
<pre><code>import sys
import json
import numpy as np
import h5py
import linecache

if __name__ == '__main__':
    # 135364 is presumably the number of lines in zhidao.train.json -- confirm.
    # linecache line numbers are 1-based, so shuffle 1..N rather than 0..N-1.
    indices = np.arange(1, 135364 + 1)
    np.random.shuffle(indices)

    # Half-open slice bounds: the first three files get 33842 lines each and
    # the fourth takes the remainder, so no shuffled line lands in two files.
    bounds = [0, 33842 * 1, 33842 * 2, 33842 * 3, len(indices)]
    for part in range(4):
        with open('zhidao.train_%d.json' % (part + 1), 'w', encoding='UTF-8') as fout:
            for lineno in indices[bounds[part]:bounds[part + 1]]:
                # Copy one randomly chosen line of zhidao.train.json to the output.
                fout.write(linecache.getline('zhidao.train.json', lineno))
</code></pre>
<p>切割大文件为若干小文件</p>
<pre><code>import sys
import json

if __name__ == '__main__':
    with open('./zhidao.train_shuffle.json', encoding='UTF-8') as fin:
        with open('zhidao.train_4.json', 'w', encoding='UTF-8') as fout:
            # lidx counts lines starting from 1.
            for lidx, line in enumerate(fin, 1):
                # Parse and re-serialize each record, keeping non-ASCII text readable.
                sample = json.loads(line.strip())
                fout.write(json.dumps(sample, ensure_ascii=False) + '\n')
                print(sample)
                # The check runs after the write, so 11 lines are copied in total.
                if lidx &gt; 10:
                    break
    print(lidx)
</code></pre>

页面列表

ITEM_HTML