China Tower O&M Manual


Scraping: Baidu

Scraping source: Baidu News (百度资讯)

News items are fetched with requests (watch out for anti-scraping: refresh the cookie periodically) and parsed with lxml. Pay attention to the encoding: the pages come from many different sources, so hard-coding UTF-8 is not recommended. Auto-detect the page encoding with r.encoding = r.apparent_encoding. XPath extraction:

# Get all news result containers
xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
# Extract fields from each result container
xpath_link = './div/h3/a[@target="_blank"]/@href'
xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
xpath_comment_numsj = './div/div/div/span[1]/@aria-label'                             # publication time
xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'   # source site
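
A minimal sketch of how these pieces fit together, assuming a simplified query URL, an example keyword and a trimmed-down header set (the full URL parameters, headers and cookie handling are in the complete script further below):

import requests
from lxml import etree

# Simplified example; the complete script builds the URL per keyword and result offset
url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=铁塔&pn=0'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

r = requests.get(url, headers=headers)
r.encoding = r.apparent_encoding      # auto-detect the page encoding instead of assuming UTF-8
dom = etree.HTML(r.text)

# Apply the XPath expressions defined above
for article in dom.xpath('//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'):
    link = article.xpath('./div/h3/a[@target="_blank"]/@href')
    title = article.xpath('./div/h3/a[@target="_blank"]/@aria-label')
    pub_time = ''.join(article.xpath('./div/div/div/span[1]/@aria-label')).strip()
    source = ''.join(article.xpath('./div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()')).strip()
    print(title, link, pub_time, source)
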
  • pip install requests lxml openpyxl pandas (third-party packages used by the imports below)

    import re
    import requests
    from lxml import etree
    from urllib import parse
    import openpyxl as op
    import pandas as pd
    import time
    from datetime import date
  • Regex check on the news timestamp to keep only recent items: re.search('小时|今天|分钟', str(t['time'])) (matches "minutes ago" / "hours ago" / "today" style timestamps)
  • De-duplicate by news title: frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
  • URL joining: the href attribute comes in two forms, absolute and relative paths; relative hrefs must be joined against the host with parse.urljoin(host, url) (see the short sketch after the complete code)
  • Complete code
def bd():
    # -*- coding: utf-8 -*-
    import json
    import re
    import requests
    from lxml import etree
    from urllib import parse
    import openpyxl as op
    import pandas as pd
    import time

    from datetime import date
    today = date.today()
    print("今天的日期:", today)

    for u in range(2):
        # start timing the run
        start_time = time.time()
        sum = 0
        jmax = 0

        hy1 = ["长河禁捕","农业","林业","水利","环保","政法","应急","国土","园区","市政","能源","铁路","乡镇","电力","海洋","广告","运营商"]  # industry sheet names in the keyword workbook

        # temporary summary workbook
        book = op.Workbook()
        sheet = book.active

        zb = r'C:\关键词.xlsx'  # keyword workbook: one sheet per industry, keywords in the first column
        ak = []
        anum = []

        for hy in hy1:
            lnum = 0
            # one sheet per industry in the temporary summary workbook
            ws = book.create_sheet(hy)
            biaotou = (
                ('链接', '标题', '时间', '来源' , '关键词'),
            )
            for biaotou1 in biaotou:
                ws.append(biaotou1)
            ws.column_dimensions['A'].width = 80
            ws.column_dimensions['B'].width = 80
            ws.column_dimensions['C'].width = 20
            ws.column_dimensions['D'].width = 20
            zb1 = pd.read_excel(zb,sheet_name=hy,header=None)
            print(str(hy) + str(zb1[0].tolist()))
            hy_list = zb1[0].tolist()
            for gjz in hy_list:
                print(gjz)
                name2 = str(gjz)
                k = []
                num = ["0","10","20"]  # pn offsets for the first three result pages
                for page in num:
                    # request headers and target URL
                    headers = {
                        'Host': 'www.baidu.com',
                        'Connection': 'keep-alive',
                        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'Sec-Fetch-Site': 'same-origin',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-User': '?1',
                        'Sec-Fetch-Dest': 'document',

                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
                    }
                    url = 'https://www.baidu.com/s?ie=utf-8&medium=0&rtt=4&bsst=1&rsv_dl=news_b_pn&cl=2&wd='+str(name2)+'&tn=news&rsv_bp=1&oq=&rsv_sug3=2&rsv_sug1=2&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&f=8&inputT=634&rsv_sug4=634&x_bfe_rqs=03E800000000000000004400000002&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&pn=' + str(page)

                    # XPath expressions (same as above)
                    # Get all news result containers
                    xpath_items = '//div[@id="container"]/div[@id="content_left"]/div[@srcid="200"]'
                    # Extract fields from each result container
                    xpath_link = './div/h3/a[@target="_blank"]/@href'
                    xpath_title = './div/h3/a[@target="_blank"]/@aria-label'
                    xpath_comment_numsj = './div/div/div/span[1]/@aria-label'
                    xpath_comment_numhy = './div/div/div/div/a[@class="source-link_Ft1ov"]/span/text()'

                    # fetch and parse the page
                    r = requests.get(url,headers=headers)
                    r.encoding = r.apparent_encoding
                    dom = etree.HTML(r.text)

                    # get all article result elements
                    items = dom.xpath(xpath_items)

                    # For each article element, put its link, title, time and source into a dict
                    data = []
                    for article in items:
                        t = {}
                        t['url'] = article.xpath(xpath_link)
                        t['title'] = article.xpath(xpath_title)
                        # the matched node may yield more than one text fragment, so join them into one string
                        # strip() removes surrounding whitespace and newlines
                        t['time'] = ''.join(article.xpath(xpath_comment_numsj)).strip()
                        t['ly'] = ''.join(article.xpath(xpath_comment_numhy)).strip()

                        # plain-text copy of the record with quotes and backslashes stripped (not used further below)
                        z1 = str(t).replace('"', '')
                        z2 = str(z1).replace('\\', '')
                        data.append(z2)

                        re1 = re.search('小时|今天|分钟', str(t['time']))
                        print(re1)
                        if re1 is None:  # not a recent item ("hours/minutes/today"), skip it
                            continue
                        biaotou = (
                            (str(t['url'][0]), str(t['title'][0]), str(t['time']), str(t['ly']) , gjz),
                        )
                        for biaotou1 in biaotou:
                            ws.append(biaotou1)

                    book.save("C:\\铁塔每日新闻记录表\\百度资讯抓取\\"+str(today)+".xlsx")

            # Re-read the sheet just written and count how many items each keyword produced
            zb1 = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\"+str(today)+".xlsx", sheet_name=hy, header=0)
            kw_count = len(zb1["关键词"].value_counts().tolist())

            z1 = zb1["关键词"].value_counts().index.tolist()
            z2 = zb1["关键词"].value_counts().tolist()

            for i in range(kw_count):
                v1 = str(z1[i]) + "  " + str(z2[i]) + "\n"

                ak.append(v1)
                jmax += int(z2[i])
                lnum += int(z2[i])
            anum.append(str(hy) + str(lnum) + "\n")

        xlsx = pd.ExcelWriter("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + "(1).xlsx")  # de-duplicated output workbook
        ac = []
        xmax = 0
        for i in hy1:
            frame = pd.read_excel("C:\\铁塔每日新闻记录表\\百度资讯抓取\\" + str(today) + ".xlsx", sheet_name=i, header=0)
            frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
            ac.append(str(i) + "  " + str(len(frame["标题"].tolist())) + "\n")
            max2 = len(frame["标题"].tolist())
            xmax += max2
            frame.to_excel(xlsx, sheet_name=i, index=None)

        xlsx.close()

        end_time = time.time()
        dd = "\n耗时: {:.2f}秒".format(end_time - start_time)
        kk1 = "".join(ak)
        kk2 = "********************"
        kk3 = "".join(ac)
    # Push a completion summary to a DingTalk group robot (signed webhook)
    import requests
    import json
    # python 3.8
    import time
    import hmac
    import hashlib
    import base64
    import urllib.parse
    timestamp = str(round(time.time() * 1000))
    secret = 'SECf2862cf7df669f2583716d62632f47ff362b319f2c3fd1df3b865f49286c8941'
    secret_enc = secret.encode('utf-8')
    # DingTalk robot signature: HMAC-SHA256 over "{timestamp}\n{secret}", then base64 + URL-encode
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    string_to_sign_enc = string_to_sign.encode('utf-8')
    hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
    print(timestamp)
    print(sign)
    webhook = "https://oapi.dingtalk.com/robot/send?access_token=48d670f795e9f427ce128beeb7fb82e25b39eb7b810720c0220762d1e4cc757e&timestamp=" + str(
        timestamp) + "&sign=" + str(sign)
    headers = {'content-type': 'application/json'}  # request headers
    data = {"msgtype": "text", "text": {"content": '✅百度资讯抓取完成' + dd + "\n" + kk1 + "\n今日抓取总量:  " + str(jmax) + "\n" + kk2 + "\n" + kk3 + "\n今日筛选总量:  " + str(xmax)},
            "at": {"atMobiles": [""], "isAtAll": False}}

    r = requests.post(webhook, headers=headers, data=json.dumps(data))
    r.encoding = 'utf-8'
    print(r.text)
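
The three helper techniques from the bullet list above, shown together as a minimal standalone sketch (the sample titles, hrefs and host value are made up for illustration; the complete script applies the time filter and the de-duplication directly, and urljoin is only needed when a result link comes back as a relative path):

import re
from urllib import parse
import pandas as pd

# 1) Keep only recent items: fresh news carries "X分钟前", "X小时前" or "今天" style timestamps
def is_recent(time_text):
    return re.search('小时|今天|分钟', str(time_text)) is not None

print(is_recent('2小时前'))       # True
print(is_recent('2023年5月1日'))  # False

# 2) De-duplicate rows by title, keeping the first occurrence
frame = pd.DataFrame({'标题': ['a', 'a', 'b'], '链接': ['u1', 'u2', 'u3']})
frame.drop_duplicates(subset=['标题'], keep='first', inplace=True)
print(frame)

# 3) Join relative hrefs against the host; absolute hrefs pass through unchanged
host = 'https://www.baidu.com'  # hypothetical base URL
print(parse.urljoin(host, '/link?url=abc'))                # -> https://www.baidu.com/link?url=abc
print(parse.urljoin(host, 'https://news.example.com/a'))   # absolute URL is returned as-is
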
