10、爬取腾讯云实验(一)
<h3>提取网页内的所有链接--并保存到文件中</h3>
<pre><code class="language-python">from bs4 import BeautifulSoup
import urllib.request
from lxml import html as etrhtml
# from lxml import etree
import time
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns False on any request failure (HTTP error status, DNS
    failure, refused/timed-out connection) so callers can retry with
    the existing ``if html == False`` convention.
    """
    try:
        req = urllib.request.Request(url)
        page = urllib.request.urlopen(req)
    # URLError is the base class of HTTPError, so this also covers
    # connection/DNS failures that the original HTTPError-only handler
    # let propagate and crash the retry loop.
    except urllib.error.URLError as e:
        print('ConnectTimeoutError: ')
        print(e)
        return False
    html = page.read().decode("utf-8")
    return html
# Candidate Sina blog listing pages kept for reference while experimenting.
url = 'http://blog.sina.com.cn/s/article_sort_3283485963_10001_2.html'
url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html'
url_all = 'http://blog.sina.com.cn/s/articlelist_3283485963_0_1.html'
# Base URL for paginated article lists; a page number + '.html' gets appended.
url_a = 'http://blog.sina.com.cn/s/articlelist_3283485963_0_'
# Counter left over from an earlier per-article scraping experiment.
i = 50
# Fetch the Tencent Cloud lab-series index page and print every element
# with class="title", together with its text and href.  NOTE(review):
# the loop variable ``j`` is a leftover from paginated Sina scraping —
# the fetched URL is fixed, so range(3, 4) runs the body exactly once.
# (Dead commented-out Sina/PDF scraping experiments removed.)
for j in range(3, 4):
    page = urllib.request.urlopen('https://cloud.tencent.com/developer/labs/series/10000')
    html = page.read().decode("utf-8")
    Soup = BeautifulSoup(html, 'lxml')
    title_result_content = Soup.find_all(class_="title")
    for link in title_result_content:
        print("-------------------------------------------------------")
        print(link)
        try:
            print(link.string)
            print(link['href'])
        except TypeError as e:
            # link.string is None when the tag holds mixed content;
            # subscripting a non-Tag can also raise TypeError.
            print(e)
        except KeyError as e:
            # The tag carries no 'href' attribute.
            print(e)
        print("-------------------------------------------------------")
</code></pre>