8、下载百度图片
<pre><code class="language-python">
# coding=utf-8
"""根据搜索词下载百度图片"""
import re
import sys
import urllib
from urllib.parse import urlencode
import os
import urllib3.request
import requests
def mkdir(path):
    """Create directory *path* (and any parents) if it does not exist.

    Leading/trailing whitespace and a trailing backslash are stripped
    before checking, so a pasted Windows path like 'D:\\x\\' works.

    Returns:
        True  if the directory was created,
        False if it already existed.
    """
    # Normalize: drop surrounding whitespace and a trailing '\' so
    # os.makedirs receives a clean path.
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        # Create the directory tree (parents included).
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        # Already present: report and do nothing.
        print(path + ' 目录已存在')
        return False
def get_onepage_urls(onepageurl):
    """Fetch one Baidu-image result page and scrape its image URLs.

    Args:
        onepageurl: URL of the current 'flip' result page; falsy means
            "no more pages".

    Returns:
        (pic_urls, fanye_url): the list of image URLs found in the page's
        "objURL" fields, and the URL of the next page ('' when there is
        no current page or the request failed).
    """
    if not onepageurl:
        print('已到最后一页, 结束')
        return [], ''
    try:
        html = requests.get(onepageurl).text
    except Exception as e:
        # Best-effort: a failed fetch ends pagination instead of crashing.
        print(e)
        return [], ''
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # BUG FIX: the original read a module-level global `pn` that was never
    # updated, so every "next page" URL pointed at the same offset (and the
    # function raised NameError when imported standalone).  Parse the
    # current offset out of the URL we just fetched and advance it by one
    # page (20 results).
    m = re.search(r'[?&]pn=(\d+)', onepageurl)
    pn_next = (int(m.group(1)) if m else 0) + 20
    fanye_url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
                 '&word=%E4%BA%BA%E8%84%B8&pn=' + str(pn_next) +
                 '&gsm=0&ct=&ic=0&lm=-1&width=0&height=0')
    return pic_urls, fanye_url
def down_pic(pic_urls, start=264):
    """Download every image in *pic_urls* into the current directory.

    Files are saved as '<start>.jpg', '<start+1>.jpg', ...; *start*
    defaults to 264 to preserve the original script's numbering (it was
    continuing an earlier batch).  A failed download is logged and
    skipped so one bad URL does not abort the whole batch.

    Args:
        pic_urls: iterable of image URLs.
        start: number used for the first file name (keyword-compatible
            generalization of the previously hard-coded offset).
    """
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            filename = str(start + i) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(pic.content)
            print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
if __name__ == '__main__':
    # Download destination; all images are written into this directory.
    save_path = 'D:\\baidu_face1'
    mkdir(save_path)
    os.chdir(save_path)
    # Start at result offset 280 (20 results per page * 14 pages already seen).
    pn = 20 * 14
    pn_str = str(pn)
    # BUG FIX: the original concatenated a stray literal '20' after pn_str,
    # producing pn=28020 instead of the intended pn=280.
    url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
           '&word=%E4%B8%AD%E5%9B%BD%E4%BA%BA%E8%84%B8&pn=' + pn_str +
           '&gsm=3c&ct=&ic=0&lm=-1&width=0&height=0')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url)
    all_pic_urls.extend(onepage_urls)
    fanye_count = 0  # number of extra pages followed so far
    while True:
        # Hard cap so the crawl always terminates.
        if fanye_count > 10:
            break
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        print('第%s页' % fanye_count)
        # Both empty means the scraper found nothing and has no next page.
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    # De-duplicate URLs before downloading.
    down_pic(list(set(all_pic_urls)))
# NOTE(review): everything below is dead code — an lxml/XPath variant of the
# scraper, parked inside a module-level triple-quoted string (a no-op
# expression at runtime, never executed).  Kept byte-identical here; consider
# deleting it outright in a follow-up.
#-*- coding:utf-8 -*-
'''
import urllib.request
from lxml import etree
url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E4%BA%BA%E8%84%B8'
url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html'
page = urllib.request.urlopen(url)
html = page.read().decode("utf-8")
selector = etree.HTML(html)
print(html)
result_content = selector.xpath('//div/a/img/text()')
for i in result_content:
print(result_content[i])
print('-------------------------------------------------------')
'''
</code></pre>