4、Downloading CJC (Chinese Journal of Computers) papers with Python
The script walks the table-of-contents page of each 2018 issue on cjc.ict.ac.cn, pairs every paper's PDF link with its Chinese title, and saves the PDFs into one directory per issue:
<pre><code>import os
import re
import urllib.request

from lxml import etree


def mkdir(path):
    """Create the directory if it does not exist yet."""
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False


def check_contain_chinese(check_str):
    """Return True if the string contains at least one Chinese character."""
    zhmodel = re.compile(u'[\u4e00-\u9fa5]')  # use [^\u4e00-\u9fa5] instead to match non-Chinese
    return zhmodel.search(check_str) is not None


def down(url, url_number):
    """Download every paper listed on one issue page into ./url_number/."""
    page = urllib.request.urlopen(url)
    html = page.read().decode("gb2312")  # the site serves GB2312-encoded pages
    selector = etree.HTML(html)
    # PDF links: on these pages the href sits on the line after the <a> tag
    urls = re.findall(r'<a\r\nhref="(.+?)">', html)
    # Paper titles: keep the <span> texts that are long enough and contain Chinese
    result_content = selector.xpath('//span/text()')
    file_names = []
    for text in result_content:
        if len(text) > 7 and check_contain_chinese(text):
            file_names.append(text)
            print(text)
    # Only download when titles and links pair up one-to-one
    if len(file_names) == len(urls):
        file_path = './' + url_number + '/'
        mkdir(file_path)
        for i in range(len(urls)):
            urllib.request.urlretrieve(urls[i], file_path + file_names[i] + '.pdf')


url_in = 'http://cjc.ict.ac.cn/qwjs/'   # issue pages look like http://cjc.ict.ac.cn/qwjs/No2018-01.htm
url_number = 'No2018-0'
url_out = '.htm'
for i in range(1, 10):
    url = url_in + url_number + str(i) + url_out
    down(url, url_number + str(i))
</code></pre>
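Depending on how a given issue page is written, the href values captured by the regular expression in down() may be relative paths rather than full URLs, in which case urlretrieve would fail. A minimal, hypothetical sketch of resolving each link against the issue page before downloading (the helper name resolve_and_fetch is not part of the original script):
<pre><code>from urllib.parse import urljoin
import urllib.request

def resolve_and_fetch(page_url, href, target_path):
    # urljoin leaves absolute hrefs unchanged and resolves relative ones
    # (e.g. a bare filename) against the issue page they were found on.
    absolute_url = urljoin(page_url, href)
    urllib.request.urlretrieve(absolute_url, target_path)

# Inside down(), the original call could then become:
# resolve_and_fetch(url, urls[i], file_path + file_names[i] + '.pdf')
</code></pre>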