6. Crawler -- Catching HTTP Errors
<pre><code>from bs4 import BeautifulSoup
import urllib.request
import urllib.error
from lxml import etree
import time

def getHtml(url):
    """Fetch a page; return its decoded HTML, or False on an HTTP error."""
    try:
        req = urllib.request.Request(url)
        page = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print('HTTPError: ')
        print(e)
        return False
    html = page.read().decode("utf-8")
    return html

url = 'http://blog.sina.com.cn/s/article_sort_3283485963_10001_2.html'
url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html'

page = urllib.request.urlopen(url_2)
html = page.read().decode("utf-8")
Soup = BeautifulSoup(html, 'lxml')
result_content = Soup.find_all(class_="atc_title")  # article-title nodes on the list page
print(result_content)
# selector = etree.HTML(html)
# result_content = selector.xpath('//a/text()')

i = 0
for link in result_content:
    print("-------------------------------------------------------")
    print(i)
    print(link.a.string)
    # print(link.a["href"])  # href attribute of the <a> tag inside this node
    html = getHtml(link.a["href"])
    if html is False:
        time.sleep(5)
        print('---Network connection failed, pausing 5 s---')
        continue
    else:
        selector = etree.HTML(html)
        result_url = selector.xpath('//a/font/span/text()')
        print(result_url[0])
        # print(link.string)  # string content of the link tag
    i = i + 1
    print(i)
</code></pre>
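The helper above only catches urllib.error.HTTPError, although connection timeouts are just as common when crawling. Below is a minimal sketch of a fetch helper that also handles connection failures and timeouts; the function name fetch_html and the 10-second timeout are assumptions for illustration, not part of the original script.
<pre><code>import socket
import urllib.request
import urllib.error

def fetch_html(url, timeout=10):
    """Return decoded HTML, or False on HTTP errors, connection errors, or timeouts.
    The 10-second timeout is an assumed value, not from the original script."""
    try:
        req = urllib.request.Request(url)
        page = urllib.request.urlopen(req, timeout=timeout)
        return page.read().decode("utf-8")
    except urllib.error.HTTPError as e:   # server answered with a 4xx/5xx status
        print('HTTPError:', e.code, e.reason)
    except urllib.error.URLError as e:    # DNS failure, refused connection, etc.
        print('URLError:', e.reason)
    except socket.timeout:                # connect/read timed out
        print('Timeout after', timeout, 'seconds')
    return False
</code></pre>
With this sketch the calling loop can keep the same `if html is False: time.sleep(5); continue` pattern, but failures caused by slow or unreachable hosts no longer crash the crawl.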