python


10、爬取腾讯云实验(一)

<h3>提取网页内的所有链接--并保存到文件中</h3> <pre><code class="language-python">from bs4 import BeautifulSoup import urllib.request from lxml import html as etrhtml # from lxml import etree import time def getHtml(url): try: req = urllib.request.Request(url) page = urllib.request.urlopen(req) except urllib.error.HTTPError as e: print('ConnectTimeoutError: ') print(e) print(1) return False html = page.read().decode("utf-8") return html url = 'http://blog.sina.com.cn/s/article_sort_3283485963_10001_2.html' url_2 = 'http://blog.sina.com.cn/s/articlelist_3283485963_4_1.html' url_all = 'http://blog.sina.com.cn/s/articlelist_3283485963_0_1.html' url_a = 'http://blog.sina.com.cn/s/articlelist_3283485963_0_' i = 50 for j in range(3,4): # print(url_a+str(j)) # page = urllib.request.urlopen(url_a+str(j)+'.html') # page = urllib.request.urlopen('https://cloud.tencent.com/developer/labs/lab/10295') page = urllib.request.urlopen('https://cloud.tencent.com/developer/labs/series/10000') html = page.read().decode("utf-8") Soup = BeautifulSoup(html,'lxml') title_result_content = Soup.find_all(class_="title") for link in title_result_content: print("-------------------------------------------------------") print(link) try: print(link.string) print(link['href']) except TypeError as e: print(e) except KeyError as e: print(e) print("-------------------------------------------------------") # / html / body / div[2] / div[2] / div[1] / div[2] / div[3] / div / div[1] / a # body &gt; div.J-lab-body &gt; div.lab-pdetail-layout &gt; div.lab-main.lab-series-main &gt; div.J-labs.labs &gt; div:nth-child(3) &gt; div &gt; div:nth-child(1) &gt; a # result_content=Soup.find_all(class_="J-doc doc") # for link in result_content: # print("-------------------------------------------------------") # print(i) # print(link.code.string) #print(link.a["href"])#查link标签的href值 # html = getHtml(link.a["href"]) # if (html == False): # time.sleep(5) # print('---网络连接失败,暂停5S,重新连接---') # continue # # else: # selector = etrhtml.etree.HTML(html) # result_url = selector.xpath('//a/font/font/text()') # try: # print(result_url[0]) # except IndexError as e: # print (e) # continue # print(link.string)#查link标签的string # i = i + 1 # page = urllib.request.urlopen(url_pdf) # html = page.read().decode("utf-8") # selector = etree.HTML(html) # result_url = selector.xpath('//a/font/span/text()') # print(result_url[0]) # # print(link.string)#查link标签的string # i = i+1 ''' &lt;a title="" target="_blank" href="http://blog.sina.com.cn/s/blog_c3b6050b0102xeks.html"&gt;《Android高级进阶》【PDF】&lt;/a&gt; //*[@id="module_928"]/div[2]/div[1]/div[1]/div[2] ''' </code></pre>

页面列表

ITEM_HTML