python


4、用 Python 下载 CJC（《计算机学报》）期刊论文 PDF

import urllib.request
import re
import os


# Compiled once at module level (the original recompiled it on every call).
# Matches any character in the CJK Unified Ideographs range used for Chinese.
_CHINESE_RE = re.compile(u'[\u4e00-\u9fa5]')


def mkdir(path):
    """Create directory *path* if it does not exist.

    Returns True when the directory was created, False when it already existed.
    """
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    # Directory already exists: report and skip creation.
    print(path + ' 目录已存在')
    return False


def check_contain_chinese(check_str):
    """Return True if *check_str* contains at least one Chinese character."""
    return _CHINESE_RE.search(check_str) is not None


def down(url, url_number):
    """Download every PDF listed on the journal index page *url*.

    Files are saved under ./<url_number>/<title>.pdf. Titles are taken from
    <span> texts longer than 7 characters that contain Chinese; downloads
    happen only when the number of titles matches the number of links.
    """
    # lxml is a third-party dependency; import lazily so the module can be
    # imported (e.g. for the helper functions) without lxml installed.
    from lxml import etree

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as page:
        # gbk is a strict superset of gb2312, so this decodes everything the
        # original did while tolerating stray out-of-range bytes.
        html = page.read().decode("gbk", errors="replace")

    selector = etree.HTML(html)

    # PDF links: on this site the href sits on the line after '<a'.
    urls = re.findall(r'<a\r\nhref="(.+?)">', html)

    # Candidate article titles.
    file_names = []
    for text in selector.xpath('//span/text()'):
        if len(text) > 7 and check_contain_chinese(text):
            file_names.append(text)
            print(text)

    if len(file_names) == len(urls):
        file_path = './' + url_number + '/'
        # Create the target directory once, not once per file as before.
        mkdir(file_path)
        for link, name in zip(urls, file_names):
            urllib.request.urlretrieve(link, file_path + name + '.pdf')


if __name__ == "__main__":
    # Guarding the driver loop means importing this module no longer kicks
    # off network downloads (the original ran them at import time).
    url_in = 'http://cjc.ict.ac.cn/qwjs/'
    url_number = 'No2018-0'
    url_out = '.htm'
    for i in range(1, 10):
        url = url_in + url_number + str(i) + url_out
        down(url, url_number + str(i))

页面列表

ITEM_HTML