抓取-百度
<h4>抓取来源-百度资讯</h4>
<p>新闻抓取,采用requests请求(<span style="color:red">注意反爬,定期更新一下cookie</span>),用lxml解析抓取,注意编码方式,网页来源较多,不建议采用UTF-8的编码方式</p>
<p>自动获取网页编码方式
<code>r.encoding = r.apparent_encoding</code>
<br>
Xpath提取方法</p>
<pre><code class="language-python"># Select every news-result container (Baidu wraps each hit in a div with srcid="200")
xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
# Relative XPaths evaluated against each container:
xpath_link = './div/h3/a[@target="_blank"]/@href'
xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
# publish time (used later for the "fresh news" filter)
xpath_comment_numsj = './div/div/div/span[1]/@aria-label'
# source/publisher name
xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'</code></pre>
<ul>
<li>
<p>pip install *</p>
<pre><code class="language-python">import re
import requests
from lxml import etree
from urllib import parse
import openpyxl as op
import pandas as pd
import time
from datetime import date</code></pre>
</li>
<li>正则判断新闻时间
<code>re.search('小时|今天|分钟', str(t['time']))</code></li>
<li>新闻标题进行去重
<code>frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)</code></li>
<li>url拼接
href属性有两种方式,绝对路径和相对路径,遇到这种情况必须要对url进行拼接
<code>parse.urljoin(host, url)</code></li>
<li>完整代码</li>
</ul>
<pre><code class="language-python">def bd():
# -*- coding: utf-8 -*
import json
import re
import requests
from lxml import etree
from urllib import parse
import openpyxl as op
import pandas as pd
import time
from datetime import date
today = date.today()
print("今天的日期:",today)
for u in range(2):
#计时
start_time = time.time()
sum = 0
jmax = 0
hy1 = ["长河禁捕","农业","林业","水利","环保","政法","应急","国土","园区","市政","能源","铁路","乡镇","电力","海洋","广告","运营商"]
# 临时汇总表
book = op.Workbook()
sheet = book.active
zb = r'C:\关键词.xlsx'
ak = []
anum = []
for hy in hy1:
lnum = 0
# 临时汇总表
ws = book.create_sheet(hy)
biaotou = (
('链接', '标题', '时间', '来源' , '关键词'),
)
for biaotou1 in biaotou:
ws.append(biaotou1)
ws.column_dimensions['A'].width = 80
ws.column_dimensions['B'].width = 80
ws.column_dimensions['C'].width = 20
ws.column_dimensions['D'].width = 20
zb1 = pd.read_excel(zb,sheet_name=hy,header=None)
print(str(hy) + str(zb1[0].tolist()))
hy_list = zb1[0].tolist()
for gjz in hy_list:
print(gjz)
name2 = str(gjz)
k = []
num = ["0","10","20"]
for page in num:
# 请求头和目标网址
headers = {
'Host': 'www.baidu.com',
'Connection': 'keep-alive',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}
url = 'https://www.baidu.com/s?ie=utf-8&medium=0&rtt=4&bsst=1&rsv_dl=news_b_pn&cl=2&wd='+str(name2)+'&tn=news&rsv_bp=1&oq=&rsv_sug3=2&rsv_sug1=2&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&f=8&inputT=634&rsv_sug4=634&x_bfe_rqs=03E800000000000000004400000002&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&pn=' + str(page)
# 第二种写法的 xpath
# 获取所有 li标签
xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
# 对每个 li标签再提取
xpath_link = './div/h3/a[@target="_blank"]/@href'
xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
xpath_comment_numsj = './div/div/div/span[1]/@aria-label'
xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'
# 获取和解析网页
r = requests.get(url,headers=headers)
r.encoding = r.apparent_encoding
dom = etree.HTML(r.text)
# 获取所有的文章标签
items = dom.xpath(xpath_items)
# 分别对每一个文章标签进行操作 将每篇文章的链接 标题 评论数 点赞数放到一个字典里
data = []
for article in items:
t = {}
t['url'] = article.xpath(xpath_link)
t['title'] = article.xpath(xpath_title)
# comment_num对应的标签里有两个文本标签 用 join方法将两个文本拼接起来
# strip()方法去除换行和空格
t['time'] = ''.join(article.xpath(xpath_comment_numsj)).strip()
t['ly'] = ''.join(article.xpath(xpath_comment_numhy)).strip()
z1 = str(t).replace('"', '')
z2 = str(z1).replace('\\', '')
data.append(z2)
re1 = re.search('小时|今天|分钟', str(t['time']))
print(re1)
if str(re1) == "None":
continue
biaotou = (
(str(t['url'][0]), str(t['title'][0]), str(t['time']), str(t['ly']) , gjz),
)
for biaotou1 in biaotou:
ws.append(biaotou1)
book.save("C:\\铁塔每日新闻记录表\\百度资讯抓取\\"+str(today)+".xlsx")
zb1 = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\"+str(today)+".xlsx", sheet_name=hy, header=0)
max = len(zb1["关键词"].value_counts().tolist())
z1 = zb1["关键词"].value_counts().index.tolist()
z2 = zb1["关键词"].value_counts().tolist()
for i in range(max):
v1 = str(z1[i]) + " " + str(z2[i]) + "\n"
ak.append(v1)
jmax += int(z2[i])
lnum += int(z2[i])
anum.append(str(hy) + str(lnum) + "\n")
xlsx = pd.ExcelWriter("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + "(1).xlsx")
ac = []
xmax = 0
for i in hy1:
frame = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + ".xlsx", sheet_name=i, header=0)
frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
ac.append(str(i) + " " + str(len(frame["标题"].tolist())) + "\n")
max2 = len(frame["标题"].tolist())
xmax += max2
frame.to_excel(xlsx, sheet_name=i, index=None)
xlsx.close()
end_time = time.time()
dd = "\n耗时: {:.2f}秒".format(end_time - start_time)
kk1 = "".join(ak)
kk2 = "********************"
kk3 = "".join(ac)
import requests
import json
# python 3.8
import time
import hmac
import hashlib
import base64
import urllib.parse
timestamp = str(round(time.time() * 1000))
secret = 'SECf2862cf7df669f2583716d62632f47ff362b319f2c3fd1df3b865f49286c8941'
secret_enc = secret.encode('utf-8')
string_to_sign = '{}\n{}'.format(timestamp, secret)
string_to_sign_enc = string_to_sign.encode('utf-8')
hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
print(timestamp)
print(sign)
webhook = "https://oapi.dingtalk.com/robot/send?access_token=48d670f795e9f427ce128beeb7fb82e25b39eb7b810720c0220762d1e4cc757e&timestamp=" + str(
timestamp) + "&sign=" + str(sign)
headers = {'content-type': 'application/json'} # 请求头
data = {"msgtype": "text", "text": {"content": '✅百度资讯抓取完成' + dd + "\n" + kk1 + "\n今日抓取总量: " + str(jmax) + "\n" + kk2 + "\n" + kk3 + "\n今日筛选总量: " + str(xmax)},
"at": {"atMobiles": [""], "isAtAll": False}}
r = requests.post(webhook, headers=headers, data=json.dumps(data))
r.encoding = 'utf-8'
print(r.text)</code></pre>