python


28、爬凤凰军事新闻

# Crawl the ifeng.com (Phoenix) military news column and store it in MongoDB.
#
# Flow: fetch the column's JSONP listing, fetch every listed article page,
# pull the article JSON that is embedded in each page, and insert one document
# per article into the ``news`` collection of the ``fenghuang`` database.
#
# Example of one entry of the listing's ``data.newsstream`` array:
#   {'commentUrl': 'ucms_7lNzaiBBs7U',
#    'id': '6516577354955894784',
#    'newsTime': '2019-03-27 16:01:11',
#    'skey': 'f62ea9',
#    'source': '环球网',
#    'thumbnails': {'image': [{'height': 295,
#                              'url': 'https://p3.ifengimg.com/2019_13/732B2DA47950920E40FF20D5690602584B45F5A0_w517_h295.jpg',
#                              'width': 517}]},
#    'thumbnailsCount': '1',
#    'title': '莫迪:印度已成功试射反卫星导弹 击落卫星',
#    'type': 'article',
#    'url': 'https://mil.ifeng.com/c/7lNzaiBBs7U'}

import json
import os
import pickle
import random
import re
import socket
import ssl
import sys
import time
import urllib.request
from functools import partial
from multiprocessing import Pool

import pymongo
import xlsxwriter
from bs4 import BeautifulSoup

# Give up on any blocking socket operation after 30 seconds.
socket.setdefaulttimeout(30)
# NOTE: this turns off HTTPS certificate verification for the whole process.
ssl._create_default_https_context = ssl._create_unverified_context

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
}

# JSONP endpoint listing the column's news items. The query separator is a
# plain '&'; the HTML-escaped '&amp;' from the original paste would corrupt
# the query string.
COLUMN_URL = (
    'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/'
    '6516560185312428555/NaN/20/14-35083-/getColumnInfoCallback'
    '?callback=getColumnInfoCallback&_=15537801398081'
)

# Raw-string patterns (no invalid '\{' escapes). Each matches from the opening
# marker up to the first closing marker on the same line, which is exactly
# what the original patterns with their redundant lazy groups matched.
_LIST_PAYLOAD_RE = re.compile(r'\{"code.*?\}\}')
_ARTICLE_PAYLOAD_RE = re.compile(r'\{"data.*?\}\]')

# Listing fields copied verbatim into each stored document, in storage order.
_COPIED_FIELDS = (
    'commentUrl',
    'id',
    'newsTime',
    'skey',
    'source',
    'thumbnails',
    'thumbnailsCount',
    'type',
    'url',
    'title',
)


def mkdir(path: str) -> bool:
    """Create directory *path* (including parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are stripped first.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    path = path.strip().rstrip("\\")
    try:
        # Try first and handle the failure: avoids the check-then-create race.
        os.makedirs(path)
    except FileExistsError:
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True


def _fetch_text(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8."""
    request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")


def openUrl(url: str) -> BeautifulSoup:
    """Download *url* and return it parsed as a BeautifulSoup (lxml) tree."""
    return BeautifulSoup(_fetch_text(url), 'lxml')


def return_json(url: str) -> str:
    """Download *url* and return the raw response text (not parsed)."""
    return _fetch_text(url)


def _extract_payload(pattern, text: str, what: str) -> str:
    """Return the first match of *pattern* in *text*.

    Raises:
        ValueError: if *text* contains no match; *what* names the payload in
            the error message.
    """
    match = pattern.search(text)
    if match is None:
        raise ValueError('no %s payload found in response' % what)
    return match.group()


def _build_record(item: dict) -> dict:
    """Fetch the article behind one listing *item* and build its document.

    The document holds the listing fields from ``_COPIED_FIELDS`` plus
    ``imgs`` (image URLs found inside ``<p>`` tags) and ``text`` (the body
    text of the article HTML).

    Raises:
        ValueError: if the article page has no embedded payload or the
            payload is not valid JSON.
        OSError: if the download fails (URL errors, timeouts).
    """
    print(item['title'])
    print(item['url'])

    page = return_json(item['url'])
    payload = _extract_payload(_ARTICLE_PAYLOAD_RE, page, 'article')
    # The match ends in '}]'; dropping the final ']' leaves a JSON object.
    article = json.loads(payload[:-1])
    body = BeautifulSoup(article['data'], 'lxml').body

    images = []
    text = ''
    if body is not None:  # an empty article parses to a tree with no <body>
        for paragraph in body.find_all('p'):
            if paragraph.img is None:
                continue
            src = paragraph.img.get('src')
            if src:
                images.append(src)
                print(src)
        text = body.text
    print(text)

    record = {field: item[field] for field in _COPIED_FIELDS}
    record['imgs'] = images
    record['text'] = text
    return record


def main() -> None:
    """Crawl the column listing and store every article in MongoDB."""
    listing_text = return_json(COLUMN_URL)
    listing_payload = _extract_payload(_LIST_PAYLOAD_RE, listing_text, 'listing')
    print(listing_payload)
    listing = json.loads(listing_payload)

    final_news = []
    for item in listing['data']['newsstream']:
        try:
            final_news.append(_build_record(item))
        except (ValueError, OSError) as exc:
            # One broken or unreachable article must not abort the whole crawl.
            print('skipping %s: %s' % (item.get('url'), exc), file=sys.stderr)

    # Connect to the local MongoDB server.
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        # Select (or lazily create) the fenghuang database and news collection.
        db_news = client.fenghuang.news

        # Insert the crawled documents.
        for news in final_news:
            print(db_news.insert_one(news))

        # Read back everything stored in the collection.
        for news in db_news.find():
            print(news['title'])
    finally:
        client.close()


if __name__ == '__main__':
    main()

页面列表

ITEM_HTML