

6. Crawler: catching HTTP errors

<pre><code>from bs4 import BeautifulSoup
from lxml import etree
import urllib.request
import urllib.error
import time


def getHtml(url):
    # Fetch a page and return its decoded HTML; return False if the request fails
    try:
        req = urllib.request.Request(url)
        page = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        # Trap HTTP errors (404, 500, ...) so one bad article link does not abort the crawl
        print('HTTPError:')
        print(e)
        return False
    return page.read().decode("utf-8")


url = 'http://blog.sina.com.cn/s/article_sort_3283485963_10001_2.html'
url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html'

# Fetch the article-list page and collect every title block
page = urllib.request.urlopen(url_2)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'lxml')
result_content = soup.find_all(class_="atc_title")
print(result_content)
# Alternative: extract the titles with XPath instead of BeautifulSoup
# selector = etree.HTML(html)
# result_content = selector.xpath('//a/text()')

i = 0
for link in result_content:
    print("-------------------------------------------------------")
    print(i)
    print(link.a.string)
    # print(link.a["href"])  # href of the a tag inside the title block
    html = getHtml(link.a["href"])
    if html is False:
        time.sleep(5)
        print('--- request failed, sleeping 5 s and skipping this link ---')
        continue
    selector = etree.HTML(html)
    result_url = selector.xpath('//a/font/span/text()')
    print(result_url[0])
    # print(link.string)  # string of the link tag
    i = i + 1
    print(i)
</code></pre>
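
The handler above only catches urllib.error.HTTPError, yet the message printed in the loop talks about connection failures, which urllib reports as urllib.error.URLError instead. A minimal sketch of a fetch helper that traps both and retries the same URL a few times is shown below; the helper name fetch_with_retry and the retry, delay, and timeout values are illustrative assumptions, not part of the original post.

<pre><code>import time
import urllib.error
import urllib.request


def fetch_with_retry(url, retries=3, delay=5):
    # Return the decoded HTML, or None once the retries are exhausted
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=10) as page:
                return page.read().decode("utf-8")
        except urllib.error.HTTPError as e:
            # The server answered with an error status (404, 500, ...); retrying rarely helps
            print('HTTPError:', e.code, e.reason)
            return None
        except urllib.error.URLError as e:
            # Network-level failure (DNS error, refused connection, timeout): wait and retry
            print('URLError:', e.reason, '- retrying in', delay, 's')
            time.sleep(delay)
    return None
</code></pre>

With such a helper, the loop could call fetch_with_retry(link.a["href"]) and simply skip a link when None comes back, instead of sleeping inside the loop body.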

[Screenshot: page list]

[Screenshot: ITEM_HTML]