python


3、解压wiki文本数据

<p>wiki数据链接: <a href="https://dumps.wikimedia.org/zhwiki/">https://dumps.wikimedia.org/zhwiki/</a></p>
<pre><code>python process_wiki.py zhwiki-20181101-pages-articles1.xml-p1p162886.bz2 wiki.zh.text</code></pre>
<pre><code># -*- coding:utf-8 -*-
# Author:Gao
"""Convert a Wikipedia XML dump (.bz2) into a plain-text corpus.

Every article yielded by ``WikiCorpus.get_texts()`` is written to the
output file as a single line of space-separated tokens.

Usage:
    python process_wiki.py INPUT_DUMP.xml.bz2 OUTPUT.text
"""
import logging
import os.path
import six
import sys
import warnings

# Installed before the gensim import, presumably to hide the UserWarnings
# gensim emits while it is being imported.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments: exactly an input dump path and an
    # output text path are expected.
    if len(sys.argv) != 3:
        print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    # An explicit (empty) dictionary is passed so that gensim does not have
    # to build one itself -- NOTE(review): confirm against the gensim docs
    # for the installed version.
    try:
        # gensim 3.x: lemmatization must be switched off explicitly.
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    except (TypeError, NotImplementedError):
        # Newer gensim releases reject the ``lemmatize`` argument; retry
        # without it (any unrelated error is simply raised again here).
        wiki = WikiCorpus(inp, dictionary={})

    # The context manager guarantees the output file is flushed and closed
    # even if parsing the dump fails part-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        for text in wiki.get_texts():
            output.write(space.join(text) + "\n")
            i += 1
            # Progress report every 10000 articles.
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)

    logger.info("Finished Saved %d articles", i)
</code></pre>

页面列表

ITEM_HTML