11、爬取腾讯云实验室数据(二)
<h3>1、使用PhantomJS打开网页、截图、提取html转为markdown格式、保存到本地、以及模拟浏览器点击操作等</h3>
<pre><code class="language-python">from selenium import webdriver
from bs4 import BeautifulSoup
from tomd import Tomd
# Save text content to a file (defaults to the current working directory).
# Example: save_to_file('test.md', 'content to save')
def save_to_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text.

    Uses a context manager so the handle is closed even if the write
    raises — the original open()/close() pair leaked on error.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
def get_html_code():
    """Open the lab page in PhantomJS and return every <code> tag.

    Returns a list of BeautifulSoup Tag objects, one per <code> element.
    NOTE(review): PhantomJS support was dropped in newer Selenium
    releases — confirm the pinned selenium version or switch to
    headless Chrome/Firefox.
    """
    url = "https://cloud.tencent.com/developer/labs/lab/10324"
    browser = webdriver.PhantomJS()
    try:
        browser.get(url)  # load the page so page_source is populated
        soup = BeautifulSoup(browser.page_source, 'lxml')
        return soup.find_all("code")  # every <code> block on the page
    finally:
        browser.quit()  # original leaked the PhantomJS process
def get_html_doc():
    """Open the lab page in PhantomJS and return all elements carrying
    the CSS classes "J-doc doc" (the lab-manual container).

    Returns a list of BeautifulSoup Tag objects.
    """
    url = "https://cloud.tencent.com/developer/labs/lab/10324"
    browser = webdriver.PhantomJS()
    try:
        browser.get(url)  # load the page so page_source is populated
        soup = BeautifulSoup(browser.page_source, 'lxml')
        # class_= matches elements whose class attribute is "J-doc doc"
        return soup.find_all(class_="J-doc doc")
    finally:
        browser.quit()  # original leaked the PhantomJS process
def get_screenshot_from_click(url, save_path="D:/2_PDF_Download/test2.png"):
    """Open *url* in PhantomJS, click the lab-manual tab, save a screenshot.

    save_path defaults to the original hard-coded location but can now be
    overridden by callers (backward-compatible generalization).
    Returns the status string 'get_screenshot_from_click--ok'.
    """
    browser = webdriver.PhantomJS()
    try:
        browser.get(url)
        # Simulate a click on the "lab manual" tab so the screenshot shows it.
        browser.find_element_by_css_selector("[class='item second']").click()
        browser.get_screenshot_as_file(save_path)  # original bound this to an unused local
    finally:
        browser.quit()  # original leaked the PhantomJS process
    return 'get_screenshot_from_click--ok'
# Assemble the scraped <code> snippets into markdown fenced code blocks.
def get_code_to_markdown_file():
    """Fetch every <code> tag from the lab page and write them to
    'code.md' as ```python fenced blocks, one block per snippet.

    Fixes the original append-in-loop concatenation, which always left
    a dangling empty ```python``` block at the end of the file.
    Scrape failures are printed and whatever was collected is saved
    (best-effort, matching the original).
    """
    fence_open = '```python\n'
    fence_close = '```'
    blocks = []
    try:
        for code in get_html_code():
            blocks.append(fence_open + str(code.text) + fence_close)
    except Exception as e:
        print(e)
    markdown = '\n\n'.join(blocks)
    print(markdown)
    save_to_file('code.md', markdown)
    return 'get_code_to_markdown_file--ok'
def html_to_markdown():
    """Convert the scraped "J-doc doc" HTML to markdown (via tomd) and
    save it to 'code1.md'.

    The conversion now runs once; the original invoked Tomd twice
    (once to print, once to save), doing the work twice.
    """
    html_string = str(get_html_doc())
    markdown = str(Tomd(html_string).markdown)
    print(markdown)
    save_to_file('code1.md', markdown)
    return "html_to_markdown - ok"
</code></pre>
<h3>2、urllib访问网页提取字符串保存到本地</h3>
<pre><code class="language-python"># coding:utf-8
import urllib
from bs4 import BeautifulSoup
import urllib.request
# Save text content to a file (defaults to the current working directory).
# Example: save_to_file('test.md', 'content to save')
def save_to_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text.

    Context-managed so the handle is closed even if the write raises —
    the original open()/close() pair leaked on error.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
# Fetch a page, extract the first "J-doc doc" element's <code> text,
# and save it to a local file.
def download_md(url, file_name):
    """Download *url* and save the lab-manual code text to *file_name*.

    The HTTP response is context-managed (the original never closed it).
    Best-effort: extraction failures (missing element / missing <code>
    child) are printed rather than raised, matching the original.
    """
    with urllib.request.urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    docs = soup.find_all(class_="J-doc doc")
    try:
        # docs[0].code.string raises if the element or its <code> is absent.
        save_to_file(file_name, docs[0].code.string)
    except Exception as e:
        print(e)
# --- Script entry: scrape the lab page and save its manual as markdown ---
url = 'https://cloud.tencent.com/developer/labs/lab/10324'
file_name = 'TensorFlow - 相关 API' + '.md'
# Use the url variable (the original defined it, then passed a duplicated
# literal), and context-manage the response so the socket is closed.
with urllib.request.urlopen(url) as page:
    html = page.read().decode("utf-8")
Soup = BeautifulSoup(html, 'lxml')
result_content = Soup.find_all(class_="J-doc doc")
print(Soup)  # debug: dump the parsed page
try:
    # The first "J-doc doc" element is expected to hold the manual text.
    save_to_file(file_name, result_content[0].code.string)
except Exception as e:
    print(e)
</code></pre>
<h3>3、剔除html标签--保留其中的文本信息</h3>
<pre><code class="language-python"># -*- coding: utf-8-*-
import re
from selenium import webdriver
def get_html():
    """Open the lab page in PhantomJS and return the rendered HTML source.

    Returns browser.page_source as a string.
    """
    url = "https://cloud.tencent.com/developer/labs/lab/10324"
    browser = webdriver.PhantomJS()
    try:
        browser.get(url)  # load so page_source reflects the rendered DOM
        return browser.page_source
    finally:
        browser.quit()  # original leaked the PhantomJS process
## Strip HTML markup, keeping only the text content.
# @param htmlstr HTML string.
def filter_tags(htmlstr):
    """Return *htmlstr* with CDATA, <script>, <style>, tags, and comments
    removed; <br> becomes a newline, blank lines are collapsed, and
    character entities are decoded via replaceCharEntity().

    All patterns are now raw strings — the originals relied on invalid
    escape sequences such as '\\s' and '\\[' in plain strings, which are
    deprecated in modern Python.
    """
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)                 # CDATA sections
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # <script>…</script>
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)     # <style>…</style>
    re_br = re.compile(r'<br\s*?/?>')         # line breaks -> '\n'
    re_h = re.compile(r'</?\w+[^>]*>')        # any remaining HTML tag
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comments
    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('\n', s)
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    s = re.compile(r'\n+').sub('\n', s)       # collapse runs of blank lines
    return replaceCharEntity(s)               # decode &lt; &amp; etc.
## Replace common HTML character entities with plain characters.
# Add new entries to CHAR_ENTITIES to handle more entities.
# @param htmlstr HTML string.
def replaceCharEntity(htmlstr):
    """Decode entities like &nbsp; &lt; &#62; in *htmlstr*.

    Unknown entity names are dropped (replaced by the empty string),
    as before. Decoding is a single pass with a replacement callback:
    the original re-scanned from the start of the string after every
    substitution, so characters produced by one replacement could
    combine with following text and be decoded a second time
    (e.g. '&amp;lt;' wrongly became '<').
    """
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"'}
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    return re_charEntity.sub(
        lambda m: CHAR_ENTITIES.get(m.group('name'), ''), htmlstr)
# Substitute *repl_string* for every match of the compiled pattern
# *re_exp* in *s*.
# NOTE(review): 'repalce' is a historical typo, kept so any external
# callers keep working; the function appears unused in this file.
def repalce(s, re_exp, repl_string):
    replaced = re_exp.sub(repl_string, s)
    return replaced
if __name__ == '__main__':
    # Fetch the rendered lab page and print its tag-stripped text.
    page_text = filter_tags(str(get_html()))
    print(page_text)