China Tower O&M Manual


Scraping - Baidu

#### Source: Baidu News (百度资讯)

News items are scraped with `requests` (**watch out for anti-crawling measures; refresh the cookie periodically**) and parsed with lxml via XPath. Pay attention to the character encoding: the results link out to many different sites, so hard-coding UTF-8 is not recommended.

Auto-detect the page encoding:

```python
r.encoding = r.apparent_encoding
```

XPath extraction:

```python
# One result container per article
xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
# Relative XPaths evaluated inside each container
xpath_link = './div/h3/a[@target="_blank"]/@href'
xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
xpath_comment_numsj = './div/div/div/span[1]/@aria-label'  # publication time
xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'  # source name
```

- Dependencies (`pip install` each of these):

  ```python
  import re
  import requests
  from lxml import etree
  from urllib import parse
  import openpyxl as op
  import pandas as pd
  import time
  from datetime import date
  ```

- Keep only recent news with a regex: `re.search('小时|今天|分钟', str(t['time']))` (sketch below)
- Deduplicate on the headline: `frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)` (sketch below)
- URL joining: the `href` attribute comes in two flavors, absolute and relative paths; a relative path must be joined against the host with `parse.urljoin(host, url)` (sketch below)
- Complete code: see the full listing at the end of this section
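Putting the encoding fix and the XPath set together: a minimal fetch-and-parse sketch. The query string is trimmed down and the keyword is an arbitrary example; the production URL and full request headers appear in the complete listing at the end of the section.

```python
import requests
from lxml import etree

# Trimmed request: the real scraper sends a much longer query string and
# a full browser-like header set (see the complete listing below).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url = 'https://www.baidu.com/s?tn=news&wd=电力&pn=0'  # keyword and page are examples

r = requests.get(url, headers=headers)
r.encoding = r.apparent_encoding          # detect the page's real charset
dom = etree.HTML(r.text)

xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
for article in dom.xpath(xpath_items):
    link = article.xpath('./div/h3/a[@target="_blank"]/@href')
    title = article.xpath('./div/h3/a[@target="_blank"]/@aria-label')
    print(title, link)
```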
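The recency regex works because Baidu renders fresh items with relative timestamps ('N分钟前', 'N小时前', '今天 …') and older items with absolute dates; anything without one of the three markers gets skipped. A small check against made-up timestamps:

```python
import re

# Made-up timestamps in the formats Baidu News renders
for t in ['10分钟前', '3小时前', '今天 12:30', '2022年5月1日']:
    if re.search('小时|今天|分钟', t):
        print('keep:', t)   # published within roughly the last day
    else:
        print('skip:', t)   # absolute date, i.e. older news
```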
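Deduplication keeps the first occurrence of each headline, which removes articles that surface on several result pages or under several keywords. A toy frame (made-up rows) showing the effect:

```python
import pandas as pd

frame = pd.DataFrame({
    '标题': ['电网改造完成', '电网改造完成', '新基站开通'],   # duplicate headline
    '链接': ['http://a.example', 'http://b.example', 'http://c.example'],
})
frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
print(frame)   # two rows remain: the first '电网改造完成' and '新基站开通'
```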
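`parse.urljoin` handles both `href` flavors uniformly: a relative path is resolved against the page it came from, while an absolute URL passes through unchanged. A sketch with a hypothetical host page:

```python
from urllib import parse

host = 'https://news.example.com/list/index.html'   # hypothetical host page

print(parse.urljoin(host, '/article/1.html'))       # root-relative
# https://news.example.com/article/1.html
print(parse.urljoin(host, 'article/1.html'))        # relative
# https://news.example.com/list/article/1.html
print(parse.urljoin(host, 'https://other.example/x'))  # absolute: unchanged
# https://other.example/x
```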
Complete listing:

```python
# -*- coding: utf-8 -*-
import base64
import hashlib
import hmac
import json
import re
import time
import urllib.parse
from datetime import date

import openpyxl as op
import pandas as pd
import requests
from lxml import etree


def bd():
    today = date.today()
    print("今天的日期:", today)

    for u in range(2):          # run the whole scrape in two passes
        start_time = time.time()                  # timing
        jmax = 0                                  # total rows scraped today
        hy1 = ["长河禁捕", "农业", "林业", "水利", "环保", "政法", "应急", "国土",
               "园区", "市政", "能源", "铁路", "乡镇", "电力", "海洋", "广告", "运营商"]
        book = op.Workbook()                      # temporary summary workbook
        zb = r'C:\关键词.xlsx'                     # keyword list, one sheet per industry
        ak = []                                   # per-keyword hit counts
        anum = []                                 # per-industry hit counts

        for hy in hy1:
            lnum = 0
            ws = book.create_sheet(hy)
            ws.append(('链接', '标题', '时间', '来源', '关键词'))   # header row
            ws.column_dimensions['A'].width = 80
            ws.column_dimensions['B'].width = 80
            ws.column_dimensions['C'].width = 20
            ws.column_dimensions['D'].width = 20

            zb1 = pd.read_excel(zb, sheet_name=hy, header=None)
            hy_list = zb1[0].tolist()
            print(str(hy) + str(hy_list))

            for gjz in hy_list:
                print(gjz)
                name2 = str(gjz)
                for page in ["0", "10", "20"]:    # first three result pages
                    headers = {
                        'Host': 'www.baidu.com',
                        'Connection': 'keep-alive',
                        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,'
                                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'Sec-Fetch-Site': 'same-origin',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-User': '?1',
                        'Sec-Fetch-Dest': 'document',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                    }
                    url = ('https://www.baidu.com/s?ie=utf-8&medium=0&rtt=4&bsst=1&rsv_dl=news_b_pn'
                           '&cl=2&wd=' + str(name2) + '&tn=news&rsv_bp=1&oq=&rsv_sug3=2&rsv_sug1=2'
                           '&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&f=8&inputT=634&rsv_sug4=634'
                           '&x_bfe_rqs=03E800000000000000004400000002&x_bfe_tjscore=0.100000'
                           '&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&pn=' + str(page))

                    # One result container per article, then relative XPaths inside it
                    xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
                    xpath_link = './div/h3/a[@target="_blank"]/@href'
                    xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
                    xpath_comment_numsj = './div/div/div/span[1]/@aria-label'
                    xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'

                    # Fetch and parse the page; let requests detect the real charset
                    r = requests.get(url, headers=headers)
                    r.encoding = r.apparent_encoding
                    dom = etree.HTML(r.text)
                    items = dom.xpath(xpath_items)

                    # Collect link / title / time / source for every article
                    for article in items:
                        t = {}
                        t['url'] = article.xpath(xpath_link)
                        t['title'] = article.xpath(xpath_title)
                        # The node may hold several text fragments: join, then strip whitespace
                        t['time'] = ''.join(article.xpath(xpath_comment_numsj)).strip()
                        t['ly'] = ''.join(article.xpath(xpath_comment_numhy)).strip()

                        # Keep only recent items ("N小时前", "今天", "N分钟前")
                        re1 = re.search('小时|今天|分钟', str(t['time']))
                        print(re1)
                        if re1 is None:
                            continue
                        ws.append((str(t['url'][0]), str(t['title'][0]),
                                   str(t['time']), str(t['ly']), gjz))
                        book.save("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + ".xlsx")

            # Per-industry statistics from the sheet just written
            zb1 = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + ".xlsx",
                                sheet_name=hy, header=0)
            counts = zb1["关键词"].value_counts()
            for kw, n in zip(counts.index.tolist(), counts.tolist()):
                ak.append(str(kw) + " " + str(n) + "\n")
                jmax += int(n)
                lnum += int(n)
            anum.append(str(hy) + str(lnum) + "\n")

        # Deduplicate every sheet by headline and write the filtered copy
        xlsx = pd.ExcelWriter("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + "(1).xlsx")
        ac = []
        xmax = 0
        for i in hy1:
            frame = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + ".xlsx",
                                  sheet_name=i, header=0)
            frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
            ac.append(str(i) + " " + str(len(frame["标题"].tolist())) + "\n")
            xmax += len(frame["标题"].tolist())
            frame.to_excel(xlsx, sheet_name=i, index=False)
        xlsx.close()

        end_time = time.time()
        dd = "\n耗时: {:.2f}秒".format(end_time - start_time)
        kk1 = "".join(ak)
        kk2 = "********************"
        kk3 = "".join(ac)

        # DingTalk signed webhook: HMAC-SHA256 over "timestamp\nsecret",
        # Base64-encoded, then URL-encoded
        timestamp = str(round(time.time() * 1000))
        secret = 'SECf2862cf7df669f2583716d62632f47ff362b319f2c3fd1df3b865f49286c8941'
        string_to_sign = '{}\n{}'.format(timestamp, secret)
        hmac_code = hmac.new(secret.encode('utf-8'), string_to_sign.encode('utf-8'),
                             digestmod=hashlib.sha256).digest()
        sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
        print(timestamp)
        print(sign)

        webhook = ("https://oapi.dingtalk.com/robot/send"
                   "?access_token=48d670f795e9f427ce128beeb7fb82e25b39eb7b810720c0220762d1e4cc757e"
                   "&timestamp=" + str(timestamp) + "&sign=" + str(sign))
        headers = {'content-type': 'application/json'}
        data = {"msgtype": "text",
                "text": {"content": '✅百度资讯抓取完成' + dd + "\n" + kk1
                                    + "\n今日抓取总量: " + str(jmax) + "\n" + kk2 + "\n" + kk3
                                    + "\n今日筛选总量: " + str(xmax)},
                "at": {"atMobiles": [""], "isAtAll": False}}
        r = requests.post(webhook, headers=headers, data=json.dumps(data))
        r.encoding = 'utf-8'
        print(r.text)
```
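The notification at the end of the listing follows DingTalk's signed-webhook scheme: sign `timestamp + '\n' + secret` with HMAC-SHA256, Base64-encode the digest, then URL-encode it. The same step pulled out into a standalone helper; the function name is ours, not part of the original script:

```python
import base64
import hashlib
import hmac
import time
import urllib.parse

def dingtalk_sign(secret):
    """Return (timestamp, sign) for a DingTalk signed webhook."""
    timestamp = str(round(time.time() * 1000))
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(secret.encode('utf-8'),
                         string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
    return timestamp, sign

# Usage: append both values to the robot/send URL
# timestamp, sign = dingtalk_sign('SEC...')  # the bot's signing secret
```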
