

8. Downloading Baidu Images

The script below downloads Baidu image-search results for a given search keyword: it pages through the flip-style result pages, extracts each image's `objURL`, and saves the files to a local directory.

```python
# coding=utf-8
"""Download Baidu image-search results for a given search keyword."""
import os
import re

import requests


def mkdir(path):
    """Create the save directory if it does not already exist."""
    path = path.strip()       # strip leading/trailing whitespace
    path = path.rstrip("\\")  # strip a trailing backslash
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        print(path + ' already exists')
        return False


def get_onepage_urls(onepageurl, pn):
    """Return the image URLs on one result page and the URL of the next page."""
    if not onepageurl:
        print('Reached the last page, stopping.')
        return [], ''
    try:
        html = requests.get(onepageurl, timeout=15).text
    except Exception as e:
        print(e)
        return [], ''
    # Each image on a "flip" result page is embedded as "objURL":"..."
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    print(pic_urls)
    # Advance the result offset by 20 (one page) to build the next page's URL.
    # %E4%BA%BA%E8%84%B8 is the URL-encoded search keyword '人脸' (face).
    pn1 = pn + 20
    fanye_url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
                 '&word=%E4%BA%BA%E8%84%B8&pn=' + str(pn1) +
                 '&gsm=0&ct=&ic=0&lm=-1&width=0&height=0')
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every image in the given list of URLs."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            # File names continue an existing numbering sequence from 264.
            filename = str(i + 264) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (i + 1, pic_url))
        except Exception as e:
            print('Failed to download image %s: %s' % (i + 1, pic_url))
            print(e)
            continue


if __name__ == '__main__':
    save_path = 'D:\\baidu_face1'
    mkdir(save_path)
    os.chdir(save_path)
    pn = 20 * 14  # starting result offset (20 results per page)
    url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
           '&word=%E4%BA%BA%E8%84%B8&pn=' + str(pn) +
           '&gsm=0&ct=&ic=0&lm=-1&width=0&height=0')

    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url, pn)
    all_pic_urls.extend(onepage_urls)

    fanye_count = 0  # number of follow-up pages fetched
    while True:
        if fanye_count > 10:
            break
        pn += 20
        onepage_urls, fanye_url = get_onepage_urls(fanye_url, pn)
        fanye_count += 1
        print('Page %s' % fanye_count)
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)

    down_pic(list(set(all_pic_urls)))


# An abandoned lxml-based alternative, kept commented out for reference:
'''
import urllib.request
from lxml import etree

url = ('http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592'
       '&lm=-1&cl=2&nc=1&ie=utf-8&word=%E4%BA%BA%E8%84%B8')
url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html'
page = urllib.request.urlopen(url)
html = page.read().decode("utf-8")
selector = etree.HTML(html)
print(html)
result_content = selector.xpath('//div/a/img/text()')
for item in result_content:
    print(item)
print('-------------------------------------------------------')
'''
```
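The search keyword above is hard-coded in percent-encoded form. To query an arbitrary keyword, the URL can be assembled with `urllib.parse.quote`, which the commented-out `keyword` lines in the original gesture at. A minimal sketch, assuming the same `flip` endpoint and `pn` offset scheme; `build_search_url` is a hypothetical helper, not part of the original script:

```python
from urllib.parse import quote


def build_search_url(keyword, pn):
    """Build a Baidu image-search 'flip' page URL for an arbitrary keyword.

    keyword: plain-text search term, percent-encoded via urllib.parse.quote.
    pn: result offset; each flip page holds 20 results.
    """
    return ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
            '&word=' + quote(keyword) + '&pn=' + str(pn) +
            '&gsm=0&ct=&ic=0&lm=-1&width=0&height=0')


# quote('人脸') yields '%E4%BA%BA%E8%84%B8', matching the hard-coded URLs above.
print(build_search_url('人脸', 0))
```

Note that Baidu may reject requests that lack a browser-like `User-Agent` header; passing `headers={'User-Agent': '...'}` to `requests.get` is a common workaround.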
