铁塔运维手册


抓取

新闻抓取

政府(ZF)网站不能被高频大量请求,所以采用 selenium 后台静默(无界面)方式读取;同时需要维护采集表,如果发现网页 xpath 变更,要及时修正。

抓取方式

采用网页 xpath 采集方式进行每日的新闻抓取。因为数据更新慢,每次只采集一页,筛选出日期为当天的新闻,写入采集表。

# Excerpt from the full script below; it runs once per scraped list entry.
# `j` is the entry's date text reduced to its digits, so it is compared with
# today's date written as YYYYMMDD.
if str(j) == str(datetime.date.today()).replace("-",""):

    new_full_url = parse.urljoin(url, c2)# resolve the scraped link against the page URL

    # Record to store: title with spaces removed, absolute URL, today's date
    # and the source name.
    mydict = {"title": str(c1).replace(" ",""), "url": str(new_full_url), "time": str(datetime.date.today()), "laiyuan":str(laiyuan)}
    try:
        x = mycol1.insert_one(mydict)
    except:
        # Every insert failure is reported as a duplicate title.
        # NOTE(review): presumably the collection has a unique index on
        # "title" -- confirm; other errors are hidden by this bare except.
        print("重复标题")

else:
    # The entry is not dated today, so it is skipped.
    print("日期不否")

完整代码


# coding:utf-8
import json
import os
import re
import time

import requests
from lxml import etree
import pymongo
import datetime
import selenium.webdriver as wb
from urllib import parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Bureau names; the loop further down uses each one as a collection name.
hy = [
    "公安局",
    "水利局",
    "农业农村局",
    "交通运输局",
    "林业和草原局",
    "生态环境局",
    "应急管理局",
    "自然资源局",
    "司法部",
    "乡村振兴局",
]

# str(date.today()) is "YYYY-MM-DD", so the split yields [year, month, day].
today = str(datetime.date.today()).split("-")
# NOTE: despite their names, MM receives the year and YY the month.
MM, YY, DD = today
print("".join((MM, YY, DD)))
myclient = pymongo.MongoClient('mongodb://192.168.1.185:27017/')

# Counters initialised to zero; nothing in the visible script reads them.
n1 = n2 = n3 = n4 = n5 = n6 = n7 = n8 = 0

# NOTE(review): this webhook key is a credential committed in source; consider
# loading it from configuration or the environment instead.
WEBHOOK_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=24e77854-27fa-4ddb-a229-3c3e0ccc2663"
WEBHOOK_TIMEOUT = 10  # seconds; without a timeout a stalled webhook hangs the run
CHROMEDRIVER_PATH = r'C:\Users\z\Desktop\z\chromedriver.exe'
PAGE_LOAD_TIMEOUT = 60  # seconds
DIGITS_RE = re.compile(r"\d+")


def _send_alert(content):
    """Post *content* as a text message to the webhook and print the reply.

    A failed notification is printed instead of raised, so that one failed
    alert cannot abort the remaining sites.
    """
    payload = {"msgtype": "text", "text": {"content": content}}
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.request(
            "POST",
            WEBHOOK_URL,
            headers=headers,
            data=json.dumps(payload),
            timeout=WEBHOOK_TIMEOUT,
        )
    except requests.RequestException as exc:
        print("通知发送失败: " + repr(exc))
        return
    print(response.text)


def _fetch_page_source(url):
    """Load *url* in headless Chrome and return the rendered page source.

    The driver is always quit, including when loading fails, so no browser
    process is left behind. This replaces the former `taskkill` of every
    chrome.exe / chromedriver.exe, which also killed unrelated browsers and
    was skipped whenever an exception occurred.
    """
    chrome_options = wb.ChromeOptions()
    # Run in the background without a window. The `headless` attribute is
    # deprecated/removed in newer Selenium 4 releases; the argument form
    # works across versions.
    chrome_options.add_argument("--headless")

    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()


def _store_todays_entries(target_col, page_url, laiyuan, titles, links, times):
    """Insert every list entry dated today into *target_col*.

    *titles*, *links* and *times* are parallel xpath results of equal length.
    An entry counts as "today" when the digits of its date text, joined
    together, equal today's date written as YYYYMMDD.
    """
    current_date = str(datetime.date.today())
    current_compact = current_date.replace("-", "")

    for raw_title, raw_link, raw_time in zip(titles, links, times):
        c1 = str(raw_title).replace("\n", "")
        c2 = str(raw_link).replace("\n", "")
        c3 = str(raw_time).replace("\n", "")

        print("\t" + c3.replace(" ", "") + "\ttitle:" + c1 + "\turl:" + c2)

        if "".join(DIGITS_RE.findall(c3)) != current_compact:
            print("日期不否")
            continue

        record = {
            "title": c1.replace(" ", ""),
            # Resolve the scraped (possibly relative) link against the page URL.
            "url": str(parse.urljoin(page_url, c2)),
            "time": current_date,
            "laiyuan": str(laiyuan),
        }
        try:
            target_col.insert_one(record)
        except pymongo.errors.DuplicateKeyError:
            # NOTE(review): presumably a unique index on "title" -- confirm.
            print("重复标题")
        except pymongo.errors.PyMongoError as exc:
            # Keep going with the next entry, but do not mislabel the failure
            # as a duplicate.
            print("写入失败: " + repr(exc))


# For every bureau, read its site configurations from the "网站采集" database
# and write today's news into the collection of the same name in "tieta2".
for ui in hy:
    source_col = myclient["网站采集"][ui]
    target_col = myclient["tieta2"][ui]

    projection = {"_id": 0, "host": 1, "name": 1, "url": 1, "xpath": 1}
    for site in source_col.find({}, projection):
        try:
            laiyuan = site["name"]
            url = site["url"]
            print(laiyuan)

            # Parse the rendered page and apply the xpaths configured for it.
            tree = etree.HTML(_fetch_page_source(url))
            xpaths = site["xpath"]
            u_title = tree.xpath(xpaths["name_x"])  # entry titles
            u_url = tree.xpath(xpaths["url_x"])  # entry links
            u_time = tree.xpath(xpaths["time_x"])  # entry dates

            if not u_title:
                # Checked first: when all three results are empty their
                # lengths are equal, and the equal-length branch would
                # otherwise hide a broken xpath without any alert.
                message = "错误通知xpath抓取为空,请检查" + url
                print(message)
                _send_alert(message)
            elif len(u_title) == len(u_url) == len(u_time):
                _store_todays_entries(
                    target_col, url, laiyuan, u_title, u_url, u_time
                )
            else:
                message = "错误通知xpath不一致,请检查" + url
                print(message)
                _send_alert(message)
        except Exception as exc:
            # `site` is never rebound, so its name is still available here.
            site_label = str(site.get("name", ""))
            print(site_label + "掉线")
            print("\t" + repr(exc))
            _send_alert(site_label + "掉线," + "请获得新结果后,再次尝试。")

页面列表

ITEM_HTML