铁塔运维手册


抓取-腾讯

抓取来源-百度资讯

新闻抓取:采用 iOS 抓包和 requests 请求(注意反爬,需定期更新 cookie),再解析抓取结果。注意编码方式:网页来源较多,编码不统一,不建议统一采用 UTF-8 的编码方式。

环境工具

charles(抓包工具) 、 java 、 python

抓取来源为腾讯新闻 App;PC 端搜索功能已关闭,只能通过移动端 iOS 抓包获取接口。

<img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=88afe8d7429345fbab1e670ad232a0e6&file=file.png" width="300">


Tieta_main1
    --main_1        #行业并发
        --Crawler   #抓取

Tieta_main2
    --main_2            #行业并发
        --download      #下载
        --h5            #生成html模板
        --moban         #html标签代码

完整代码

package zpw.tieta;
import java.text.SimpleDateFormat;

import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import kong.unirest.HttpResponse;
import kong.unirest.Unirest;
import org.bson.Document;
import org.json.simple.parser.ParseException;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.json.simple.parser.JSONParser;

public class Crawler {
    private String name; //私有化值-行业名-库表

    public void pcsetName(String name) {
        this.name = name;

    }

    public String pcgetName() {
        //入参
        JSONArray newsLis = JSONArray.fromObject(name);
        Object dbk = newsLis.get(0);
        String dbk1 = dbk.toString();
        Object hhy = newsLis.get(1);
        String hhy1 = hhy.toString();

        try {
            //读取本地sql配置文件,解析JSON文件的内容
            JSONParser jsonParser = new JSONParser();
            org.json.simple.JSONObject jsonObject = (org.json.simple.JSONObject) jsonParser.parse(new FileReader(&quot;C:\\sql.json&quot;));

            net.sf.json.JSONObject jsonObject1 = net.sf.json.JSONObject.fromObject(jsonObject);
            String mongo = jsonObject1.getString(&quot;mongodb&quot;);
            //主机名
            net.sf.json.JSONObject host = net.sf.json.JSONObject.fromObject(mongo);
            String host_ip = host.getString(&quot;host&quot;);
            //端口
            net.sf.json.JSONObject port1 = net.sf.json.JSONObject.fromObject(mongo);
            String port = port1.getString(&quot;port&quot;);
            //数据库
            net.sf.json.JSONObject db = net.sf.json.JSONObject.fromObject(mongo);
            String data = db.getString(&quot;db&quot;);

            //    host:主机名
            //    port:指定端口
            //    username:数据库用户名
            //    password:数据库密码

            // 连接到 mongodb 服务
            MongoClient mongoClient = new MongoClient(host_ip, Integer.parseInt(port));

            // 连接到数据库
            MongoDatabase mongoDatabase = mongoClient.getDatabase(data);

            MongoCollection&lt;Document&gt; collection1 = mongoDatabase.getCollection(&quot;gjz&quot;);

            FindIterable&lt;Document&gt; documentss = collection1.find();
            MongoCursor&lt;Document&gt; mongoCursor = documentss.iterator();

            while (mongoCursor.hasNext()) {
                String js = mongoCursor.next().toJson();
                System.out.println(js);
                Object documentc = Configuration.defaultConfiguration().jsonProvider().parse(js);
                String hhyy = JsonPath.read(documentc, &quot;$.hy&quot;);
                System.out.println(hhy);
                String name1 = JsonPath.read(documentc, &quot;$.name1&quot;);
                String name2 = JsonPath.read(documentc, &quot;$.name2&quot;);
                String name3 = JsonPath.read(documentc, &quot;$.name3&quot;);

                String aa11 = hhy1;
                boolean status = aa11.contains(hhyy);

                if (status) {
                    List&lt;String&gt; list = new ArrayList&lt;String&gt;();
                    list.add(name1);
                    list.add(name2);
                    list.add(name3);
                    for (Object oo : list) {
                        MongoCollection&lt;Document&gt; collection = mongoDatabase.getCollection(dbk1);

                        //爬取15页
                        for (int i = 0; i &lt;= 15; i++) {
                            try {

                                //来源腾讯新闻app,搜索功能h5未开放,charles或fiddler抓取
                                HttpResponse&lt;String&gt; response = Unirest.post(&quot;https://r.inews.qq.com/gw/search/result?isJailbreak=0&amp;currentSetId=group_news2&amp;qn-rid=1011_24DC018C-7ACB-4162-A99C-84AE9B244A98&amp;device_model=iPad11%2C6&amp;deviceToken=a72e67ec8dbdd7365979e47ed2ced323007d1050cc6ead4b74bbce4525ae2357&amp;device_appin=FCAA47DA-4A3D-4FE0-8D3E-4941D12F8984&amp;currentTabId=news_news&amp;visit_mode=0&amp;isMainUserLogin=0&amp;qqnews_refpage=QNHippyChannelViewController&amp;__qnr=287952b912d5&amp;qn-time=1657346873401&amp;qn-sig=DE481981E89BB739272BEEB0C4F45A8C&amp;network_type=wifi&amp;cookie=logintype%3D2%3Bsuid%3D8gMc3n5c5YUZvDja5gtx&amp;startTimestamp=1657346835&amp;hw=iPad11%2C6&amp;q36_create_time=1656172800&amp;page_type=other&amp;new_town=0&amp;create_time=1656212040&amp;omgbizid=%20&amp;screen_height=667&amp;trueVersion=6.8.80&amp;global_session_id=1657114164331&amp;omgid=%20&amp;idfa=00000000-0000-0000-0000-000000000000&amp;user_vip_type=0&amp;qn-newsig=2264fbea226389e4ff7022b1bbac032bd6e379dd891944563ff75d3382063b31&amp;preStartTimestamp=1657346776&amp;screen_scale=2&amp;pagestartfrom=icon&amp;appver=15.5_qqnews_6.8.80&amp;store=1&amp;screen_width=375&amp;devid=4E6B2A1F-791E-43EF-87BF-5EF95903B650&amp;QIMEI36=0c12107781e83d0d5ece53d0000016b15a1f&amp;net_ssid=&amp;activefrom=icon&amp;apptype=ios&amp;suid=8gMc3n5c5YUZvDja5gtx&amp;httpRequestUid=287952b9d536&quot;)
                                        .header(&quot;referer&quot;, &quot;http://inews.qq.com/inews/iphone/&quot;)
                                        .header(&quot;content-type&quot;, &quot;application/x-www-form-urlencoded&quot;)
                                        .body(&quot;transparam=%7B%22sessionid%22%3A%2203dd55bdb70868bc47c81ca33e05576d%22%7D&amp;hippy_res_ver=hippy_memory_B%3A%28null%29%2Chippy_react_vendor%3A%28null%29%2Chippy_audio%3A%28null%29%2Chippy_vendor%3A680014%2Chippy_list%3A%28null%29%2Chippy_memory%3A%28null%29%2Chippy_cell%3A683626%2Chippy_memory_A%3A%28null%29%2Chippy_negative_screen%3A684033&amp;hippy_search=1&amp;hippy_custom_version=6&amp;query=&quot; + oo + &quot;&amp;search_type=pictext&amp;type=0&amp;currentChannelId=_qqnews_custom_search_pictext&amp;page=&quot; + i + &quot;&amp;global_info=1%7C054d000200fec352cc4d8614bb2b8bf8%7C%7C0%7C0%7C13%7C7%7C1%7C0%7C6%7C1%7C1%7C0%7C%7C5%7C%7C1429%7C0%7C1%7C6%7C2%7C4%7C4%7C0%7C%7C3%7C3%7C1%7C1%7C1%7C1%7C1%7C1%7C-1%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C3%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C1%7C0%7C1%7C1%7C1%7C0%7C1%7C0%7C0%7C0%7C1%7C0%7C11%7C20%7C1%7C0%7C1%7C0%7C0%7C0%7C1%7C4%7C0%7C1%7C1%7C41%7C0%7C51%7C60%7C0%7C1%7C0%7C0%7C1%7C0%7C1%7C0%7C0%7C71%7C0%7C0%7C1%7C71&quot;)
                                        .asString();

                                String a1 = response.getBody();
                                //System.out.println(a1);
                                JSONObject jsonObject0 = JSONObject.fromObject(a1);
                                String error = jsonObject0.getString(&quot;secList&quot;);
                                System.out.println(&quot;新闻数量:&quot; + error);
                                //元素数量
                                Object sc = error;
                                JSONArray secList = JSONArray.fromObject(sc);
                                System.out.println(secList.size());

                                //循环列表
                                for (Object sc1 : secList) {
                                    //System.out.println(sc1);
                                    JSONObject xw = JSONObject.fromObject(sc1);
                                    Object xw_1 = xw.getString(&quot;newsList&quot;);
                                    Object newsList = xw_1;
                                    JSONArray newsList1 = JSONArray.fromObject(newsList);
                                    Object newsList2 = newsList1.get(0);
                                    JSONObject title1 = JSONObject.fromObject(newsList2);
                                    //System.out.println(title1);
                                    Object title = title1.getString(&quot;title&quot;);
                                    Object time = title1.getString(&quot;time&quot;);
                                    Object url = title1.getString(&quot;url&quot;);
                                    Object laiyuan = title1.getString(&quot;chlname&quot;);

                                    System.out.println(&quot;时间:&quot; + time + &quot;\t标题:&quot; + title + &quot;链接:&quot; + url + &quot;来源:&quot; + laiyuan);

                                    //转换今日日期
                                    Date dt = new Date();
                                    SimpleDateFormat sdf = new SimpleDateFormat(&quot;yyyy-MM-dd&quot;);
                                    String s = sdf.format(dt); /** 类型转换**/

                                    //判断包含
                                    String aa = time.toString();
                                    boolean status1 = aa.contains(s);

                                    if (status1) {
                                        System.out.println(&quot;今日新闻&quot;);

                                        /*写库*/
                                        try {
                                            Document document = new Document(&quot;title&quot;, title).
                                                    append(&quot;time&quot;, aa.split(&quot; &quot;)[0]).
                                                    append(&quot;laiyuan&quot;, laiyuan).
                                                    append(&quot;url&quot;, url);
                                            List&lt;Document&gt; documents = new ArrayList&lt;Document&gt;();
                                            documents.add(document);
                                            collection.insertMany(documents);
                                        } catch (Exception e) {
                                            System.out.println(&quot;重复标题,写入失败&quot;);

                                        }

                                    } else {
                                        System.out.println(&quot;新闻日期不否,跳过&quot;);
                                    }

                                }
                            } catch (Exception e) {
                                System.out.println(&quot;被反爬&quot;);

                            }
                        }

                    }

                    break;
                } else {
                    continue;
                }

            }
            this.name = name;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return this.name;

    }

}

页面列表

ITEM_HTML