抓取-腾讯
抓取来源-百度资讯
新闻抓取:先通过 iOS 抓包获取接口,再用 requests 发起请求并解析结果(注意反爬,需定期更新 cookie)。由于网页来源较多,需注意编码方式,不建议采用 UTF-8 的编码方式。
环境工具
charles(抓包工具) 、 java 、 python
抓取来源为腾讯新闻 App;PC 端搜索引擎已关闭,只能通过移动端(iOS)抓包获取接口。
<img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=88afe8d7429345fbab1e670ad232a0e6&file=file.png" width="300">
Tieta_main1
--main_1 #行业并发
--Crawler #抓取
Tieta_main2
--main_2 #行业并发
--download #下载
--h5 #生成html模板
--moban #html标签代码
完整代码
package zpw.tieta;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import kong.unirest.HttpResponse;
import kong.unirest.Unirest;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.bson.Document;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
/**
 * Searches the Tencent News app search endpoint for an industry's keywords and stores
 * the articles dated today in MongoDB.
 *
 * <p>Usage: call {@link #pcsetName(String)} with a JSON array string of the form
 * {@code [targetCollection, industry]}, then call {@link #pcgetName()} to run the crawl.
 *
 * <p>NOTE(review): the request URL and body embed a captured device token, cookie and
 * request signatures. They are kept byte-for-byte from the original capture; they
 * presumably expire and need to be re-captured periodically.
 */
public class Crawler {

    /** Local JSON file whose "mongodb" entry holds the host, port and db settings. */
    private static final String CONFIG_PATH = "C:\\sql.json";

    /** Collection that maps an industry ("hy") to its search keywords ("name1".."name3"). */
    private static final String KEYWORD_COLLECTION = "gjz";

    /**
     * Index of the last result page requested; pages 0 through this value are fetched.
     * NOTE(review): the original comment said "15 pages" but 0..15 is 16 requests —
     * kept as-is, confirm whether the endpoint's paging starts at 0 or 1.
     */
    private static final int LAST_PAGE_INDEX = 15;

    /** Search endpoint including the query-string parameters captured from the iOS app. */
    private static final String SEARCH_URL = "https://r.inews.qq.com/gw/search/result?isJailbreak=0&currentSetId=group_news2&qn-rid=1011_24DC018C-7ACB-4162-A99C-84AE9B244A98&device_model=iPad11%2C6&deviceToken=a72e67ec8dbdd7365979e47ed2ced323007d1050cc6ead4b74bbce4525ae2357&device_appin=FCAA47DA-4A3D-4FE0-8D3E-4941D12F8984&currentTabId=news_news&visit_mode=0&isMainUserLogin=0&qqnews_refpage=QNHippyChannelViewController&__qnr=287952b912d5&qn-time=1657346873401&qn-sig=DE481981E89BB739272BEEB0C4F45A8C&network_type=wifi&cookie=logintype%3D2%3Bsuid%3D8gMc3n5c5YUZvDja5gtx&startTimestamp=1657346835&hw=iPad11%2C6&q36_create_time=1656172800&page_type=other&new_town=0&create_time=1656212040&omgbizid=%20&screen_height=667&trueVersion=6.8.80&global_session_id=1657114164331&omgid=%20&idfa=00000000-0000-0000-0000-000000000000&user_vip_type=0&qn-newsig=2264fbea226389e4ff7022b1bbac032bd6e379dd891944563ff75d3382063b31&preStartTimestamp=1657346776&screen_scale=2&pagestartfrom=icon&appver=15.5_qqnews_6.8.80&store=1&screen_width=375&devid=4E6B2A1F-791E-43EF-87BF-5EF95903B650&QIMEI36=0c12107781e83d0d5ece53d0000016b15a1f&net_ssid=&activefrom=icon&apptype=ios&suid=8gMc3n5c5YUZvDja5gtx&httpRequestUid=287952b9d536";

    /** Form body up to and including "query="; the encoded keyword is appended next. */
    private static final String BODY_PREFIX = "transparam=%7B%22sessionid%22%3A%2203dd55bdb70868bc47c81ca33e05576d%22%7D&hippy_res_ver=hippy_memory_B%3A%28null%29%2Chippy_react_vendor%3A%28null%29%2Chippy_audio%3A%28null%29%2Chippy_vendor%3A680014%2Chippy_list%3A%28null%29%2Chippy_memory%3A%28null%29%2Chippy_cell%3A683626%2Chippy_memory_A%3A%28null%29%2Chippy_negative_screen%3A684033&hippy_search=1&hippy_custom_version=6&query=";

    /** Form body between the keyword and the page number. */
    private static final String BODY_MIDDLE = "&search_type=pictext&type=0&currentChannelId=_qqnews_custom_search_pictext&page=";

    /** Form body after the page number. */
    private static final String BODY_SUFFIX = "&global_info=1%7C054d000200fec352cc4d8614bb2b8bf8%7C%7C0%7C0%7C13%7C7%7C1%7C0%7C6%7C1%7C1%7C0%7C%7C5%7C%7C1429%7C0%7C1%7C6%7C2%7C4%7C4%7C0%7C%7C3%7C3%7C1%7C1%7C1%7C1%7C1%7C1%7C-1%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C3%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C1%7C0%7C1%7C1%7C1%7C0%7C1%7C0%7C0%7C0%7C1%7C0%7C11%7C20%7C1%7C0%7C1%7C0%7C0%7C0%7C1%7C4%7C0%7C1%7C1%7C41%7C0%7C51%7C60%7C0%7C1%7C0%7C0%7C1%7C0%7C1%7C0%7C0%7C71%7C0%7C0%7C1%7C71";

    /** JSON array text: element 0 is the target collection name, element 1 the industry name. */
    private String name;

    /**
     * Sets the crawl arguments.
     *
     * @param name JSON array string {@code [targetCollection, industry]}
     */
    public void pcsetName(String name) {
        this.name = name;
    }

    /**
     * Runs the crawl described by the arguments given to {@link #pcsetName(String)}.
     *
     * <p>Reads the MongoDB settings from {@value #CONFIG_PATH}, finds the first keyword
     * record whose industry is contained in the requested industry, searches each of its
     * keywords, and inserts the articles dated today into the target collection.
     * Configuration I/O and parse failures are printed and otherwise ignored; runtime
     * exceptions from argument parsing or the database lookup propagate to the caller.
     *
     * @return the unchanged argument string previously set via {@link #pcsetName(String)}
     */
    public String pcgetName() {
        JSONArray args = JSONArray.fromObject(name);
        String targetCollection = args.get(0).toString();
        String industry = args.get(1).toString();
        try {
            JSONObject mongo = loadMongoConfig();
            String host = mongo.getString("host");
            String port = mongo.getString("port");
            String database = mongo.getString("db");

            MongoClient mongoClient = new MongoClient(host, Integer.parseInt(port));
            try {
                crawlIndustry(mongoClient.getDatabase(database), targetCollection, industry);
            } finally {
                // Release the connection pool even when the crawl fails part-way.
                mongoClient.close();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return this.name;
    }

    /** Reads the config file and returns its "mongodb" entry, closing the file afterwards. */
    private JSONObject loadMongoConfig() throws IOException, ParseException {
        FileReader reader = new FileReader(CONFIG_PATH);
        try {
            org.json.simple.JSONObject raw =
                    (org.json.simple.JSONObject) new JSONParser().parse(reader);
            JSONObject config = JSONObject.fromObject(raw);
            return JSONObject.fromObject(config.getString("mongodb"));
        } finally {
            reader.close();
        }
    }

    /** Finds the first keyword record matching the industry and crawls its three keywords. */
    private void crawlIndustry(MongoDatabase database, String targetCollection, String industry)
            throws IOException {
        MongoCursor<Document> cursor = database.getCollection(KEYWORD_COLLECTION).find().iterator();
        try {
            while (cursor.hasNext()) {
                String json = cursor.next().toJson();
                System.out.println(json);
                Object parsed = Configuration.defaultConfiguration().jsonProvider().parse(json);
                String recordIndustry = JsonPath.read(parsed, "$.hy");
                System.out.println(industry);
                String name1 = JsonPath.read(parsed, "$.name1");
                String name2 = JsonPath.read(parsed, "$.name2");
                String name3 = JsonPath.read(parsed, "$.name3");
                if (!industry.contains(recordIndustry)) {
                    continue;
                }
                MongoCollection<Document> collection = database.getCollection(targetCollection);
                List<String> keywords = new ArrayList<String>();
                keywords.add(name1);
                keywords.add(name2);
                keywords.add(name3);
                for (String keyword : keywords) {
                    crawlKeyword(collection, keyword);
                }
                // Only the first matching record is processed.
                break;
            }
        } finally {
            cursor.close();
        }
    }

    /** Requests every result page for one keyword; a failing page does not stop the others. */
    private void crawlKeyword(MongoCollection<Document> collection, String keyword)
            throws IOException {
        if (keyword == null || keyword.isEmpty()) {
            // Nothing to search for; avoids sending the literal text "null" as the query.
            return;
        }
        // The body is application/x-www-form-urlencoded, so the keyword must be escaped;
        // otherwise '&', '=', '%' or spaces in it would corrupt the other form fields.
        String encodedKeyword = URLEncoder.encode(keyword, "UTF-8");
        for (int page = 0; page <= LAST_PAGE_INDEX; page++) {
            try {
                crawlPage(collection, encodedKeyword, page);
            } catch (Exception e) {
                // Any request or parse failure lands here, not only anti-crawler blocks,
                // so keep the original message but also surface the real cause.
                System.out.println("被反爬");
                e.printStackTrace();
            }
        }
    }

    /** Fetches one result page and stores the first article of each section if dated today. */
    private void crawlPage(MongoCollection<Document> collection, String encodedKeyword, int page) {
        HttpResponse<String> response = Unirest.post(SEARCH_URL)
                .header("referer", "http://inews.qq.com/inews/iphone/")
                .header("content-type", "application/x-www-form-urlencoded")
                .body(BODY_PREFIX + encodedKeyword + BODY_MIDDLE + page + BODY_SUFFIX)
                .asString();
        JSONObject payload = JSONObject.fromObject(response.getBody());
        String sections = payload.getString("secList");
        System.out.println("新闻数量:" + sections);
        JSONArray secList = JSONArray.fromObject(sections);
        System.out.println(secList.size());

        String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        for (Object section : secList) {
            JSONObject sectionJson = JSONObject.fromObject(section);
            JSONArray newsList = JSONArray.fromObject(sectionJson.getString("newsList"));
            // NOTE(review): only the first entry of each section's newsList is read,
            // as in the original — confirm sections never carry more than one article.
            JSONObject article = JSONObject.fromObject(newsList.get(0));
            String title = article.getString("title");
            String time = article.getString("time");
            String url = article.getString("url");
            String source = article.getString("chlname");
            System.out.println("时间:" + time + "\t标题:" + title + "链接:" + url + "来源:" + source);

            if (time.contains(today)) {
                System.out.println("今日新闻");
                storeArticle(collection, title, time, url, source);
            } else {
                System.out.println("新闻日期不否,跳过");
            }
        }
    }

    /** Inserts one article; failures are reported and skipped so the crawl continues. */
    private void storeArticle(MongoCollection<Document> collection, String title, String time,
            String url, String source) {
        try {
            Document document = new Document("title", title)
                    .append("time", time.split(" ")[0])
                    .append("laiyuan", source)
                    .append("url", url);
            collection.insertOne(document);
        } catch (Exception e) {
            // Best-effort write: the message suggests the usual failure is a rejected
            // duplicate title — presumably a unique index; verify against the schema.
            System.out.println("重复标题,写入失败");
        }
    }
}