3、解压wiki文本数据
<p>wiki数据链接: <a href="https://dumps.wikimedia.org/zhwiki/">https://dumps.wikimedia.org/zhwiki/</a></p>
<pre><code>python process_wiki.py zhwiki-20181101-pages-articles1.xml-p1p162886.bz2 wiki.zh.text</code></pre>
<pre><code># -*- coding:utf-8 -*-
# Author:Gao
import logging
import os.path
import six
import sys
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.corpora import WikiCorpus
# Script entry point: read the Wikipedia dump named by the first argument and
# write one article per line (tokens joined by single spaces) to the file
# named by the second argument.
if __name__ == '__main__':
    # Name the logger after the script file so log lines identify their source.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Exactly two positional arguments are required: input dump, output file.
    if len(sys.argv) != 3:
        print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    space = " "
    # Running count of articles written. Initialised up front so the name is
    # still bound after the loop even when the corpus yields no articles.
    i = 0

    # The context manager guarantees the output file is flushed and closed
    # even if corpus construction or a write raises part-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        # NOTE(review): the `lemmatize` keyword is believed to have been
        # removed from WikiCorpus in gensim 4.x; confirm against the gensim
        # version in use before upgrading.
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for i, text in enumerate(wiki.get_texts(), start=1):
            output.write(space.join(text) + "\n")
            # Periodic progress report every 10000 articles.
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)
# Log the final value of the article counter as a completion summary.
logger.info("Finished Saved " + str(i) + " articles")</code></pre>