28、爬凤凰军事新闻
<pre><code>from bs4 import BeautifulSoup
import urllib.request
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys
from functools import partial
from multiprocessing import Pool
import pymongo
# from PyQt5.QtWidgets import QApplication, QWidget, QPushButton, QHBoxLayout, QTextEdit
# Set the default timeout for newly created sockets to 30 seconds.
socket.setdefaulttimeout(30)
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, which appears to disable certificate verification for
# HTTPS requests made through urllib in this process — confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context
def mkdir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Leading/trailing whitespace and trailing backslashes are removed from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if this call created the directory, False if the path already
        existed.

    Raises:
        OSError: If creation fails and the path does not exist afterwards.
    """
    path = path.strip().rstrip("\\")
    try:
        # Create first and inspect the failure afterwards: checking
        # os.path.exists() before os.makedirs() leaves a window in which
        # another process can create the directory and make this call raise.
        os.makedirs(path)
    except OSError:
        if not os.path.exists(path):
            raise
        # The path is already there, so nothing had to be created.
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True
def openUrl(url):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    The request is sent with a Safari User-Agent header, the response body is
    decoded as UTF-8 and parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the downloaded HTML.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    return BeautifulSoup(html, 'lxml')
def return_json(url):
    """Download ``url`` and return the response body as text.

    Despite the name, no JSON parsing happens here: the body is only decoded
    as UTF-8 and returned unchanged, leaving extraction to the caller.

    Args:
        url: Address to fetch.

    Returns:
        The decoded response body as a ``str``.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
# Column-listing endpoint queried for the news list. The ``callback`` query
# parameter suggests the reply wraps its JSON in a function call (presumably
# JSONP), which is why the JSON object is cut out with a regex below.
url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default/6516560185312428555/NaN/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback&_=15537801398081'
page_news = return_json(url)

# Match from the first '{"code' up to the first '}}' that follows it.
# A raw string avoids invalid escape sequences, and the single lazy
# quantifier matches the same span as the former '.*?(.*?).*?' without the
# excessive backtracking three adjacent lazy quantifiers cause on a miss.
result = re.search(r'\{"code.*?\}\}', page_news)
if result is None:
    raise ValueError('no JSON payload found in the column listing response')
print(result.group())
json_news = json.loads(result.group())

# Raw article fragments as matched in each article page.
news_data = []
# Documents that will be written to MongoDB.
final_news = []

# Matches from the first '{"data' up to the first '}]' that follows it on the
# same line; compiled once because it is applied to every article page.
ARTICLE_PATTERN = re.compile(r'\{"data.*?\}\]')
# Fields copied unchanged from a listing entry into the stored document,
# in the order they are stored.
LIST_FIELDS = ('commentUrl', 'id', 'newsTime', 'skey', 'source', 'thumbnails',
               'thumbnailsCount', 'type', 'url', 'title')

for data in json_news['data']['newsstream']:
    print(data['title'])
    print(data['url'])
    news_info = return_json(data['url'])
    news_result = ARTICLE_PATTERN.search(news_info)
    if news_result is None:
        # Skip pages without the expected fragment instead of aborting the
        # whole crawl before anything has been stored.
        print('skip (no article data found): ' + data['url'])
        continue
    data_news = news_result.group()
    news_data.append(data_news)
    try:
        # Drop the trailing ']' so that only the JSON object remains.
        json_news_data = json.loads(data_news[:-1])
    except json.JSONDecodeError:
        print('skip (article data is not valid JSON): ' + data['url'])
        continue
    data_soup = BeautifulSoup(json_news_data['data'], 'lxml')
    # The parsed tree may have no <body> (e.g. for empty markup).
    body = data_soup.body
    paragraphs = body.find_all('p') if body is not None else []
    # Collect the image address of every paragraph that contains an <img>.
    img_list = [p.img['src'] for p in paragraphs if p.img is not None]
    for src in img_list:
        print(src)
    body_text = body.text if body is not None else ''
    print(body_text)
    final_text = {key: data[key] for key in LIST_FIELDS}
    final_text['imgs'] = img_list
    final_text['text'] = body_text
    final_news.append(final_text)

# Connect to the local MongoDB server.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Select (or implicitly create) the "fenghuang" database.
db = client.fenghuang
# Select (or implicitly create) the "news" collection.
db_news = db.news
# Store the collected articles one by one, printing each insert result.
for news in final_news:
    insert_result = db_news.insert_one(news)
    print(insert_result)
# Read back every document in the collection and print its title.
news_list = db_news.find()
for news in news_list:
    print(news['title'])
'''
{'commentUrl': 'ucms_7lNzaiBBs7U',
'id': '6516577354955894784',
'newsTime': '2019-03-27 16:01:11',
'skey': 'f62ea9',
'source': '环球网',
'thumbnails': {'image': [{'height': 295,
'url': 'https://p3.ifengimg.com/2019_13/732B2DA47950920E40FF20D5690602584B45F5A0_w517_h295.jpg',
'width': 517}]},
'thumbnailsCount': '1',
'title': '莫迪:印度已成功试射反卫星导弹 击落卫星',
'type': 'article',
'url': 'https://mil.ifeng.com/c/7lNzaiBBs7U'}
'''</code></pre>