九阴真经


爬虫

<h3>HtmlUnit+Jsoup</h3>
<ul> <li>依赖:</li> </ul>
<pre><code>&lt;dependency&gt;
    &lt;groupId&gt;net.sourceforge.htmlunit&lt;/groupId&gt;
    &lt;artifactId&gt;htmlunit&lt;/artifactId&gt;
    &lt;version&gt;2.23&lt;/version&gt;
&lt;/dependency&gt;
&lt;dependency&gt;
    &lt;groupId&gt;org.jsoup&lt;/groupId&gt;
    &lt;artifactId&gt;jsoup&lt;/artifactId&gt;
    &lt;version&gt;1.13.1&lt;/version&gt;
&lt;/dependency&gt;</code></pre>
<ul> <li>案例:</li> </ul>
<pre><code>package com.example.demo.spider;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestClientException;
import org.springframework.web.client.RestTemplate;

import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Crawler demo: downloads a page, selects elements with a CSS query and collects
 * either their text or the value of one of their attributes.
 *
 * @author jxd
 * @version 1.0
 * @date 2021/3/25 9:31
 */
public class App {

    /**
     * Runs the crawler against a fixed sample URL and prints every collected value.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        String url = "https://mvnrepository.com/artifact/org.jsoup/jsoup/1.13.1";
        String regex = "^/.*";
        String cssQuery = ".categories li a";
        // An empty attribute key makes the crawler collect element text instead of an attribute.
        String attribute = "";
        Set&lt;String&gt; set = getValByUrlAndCssSelect(url, cssQuery, attribute, regex);
        System.out.println("爬虫结果:::");
        for (String s : set) {
            System.out.println(s);
        }
    }

    /**
     * Fetches a page and extracts values from the elements matched by a CSS selector.
     *
     * The page is first requested with {@link RestTemplate}; if that request fails with a
     * {@link RestClientException}, it is fetched again through {@link #getByBrowser(String)}.
     * When {@code attributeKey} is non-empty, the value of that attribute is collected for
     * every matched element whose value matches {@code regex} in full. Otherwise the text of
     * every matched element is collected and {@code regex} is not used.
     * Parsing and selection errors are printed and do not propagate; whatever was collected
     * up to that point is returned.
     *
     * @param url          HTTP address of the web page
     * @param cssQuery     CSS selector
     * @param attributeKey attribute key; {@code null} or empty to collect element text
     * @param regex        regular expression the attribute value has to match
     * @author jxd
     * @date 2021/3/25 10:07
     * @return the collected values in document order without duplicates; empty when the page
     *         could not be loaded
     */
    public static Set&lt;String&gt; getValByUrlAndCssSelect(String url, String cssQuery, String attributeKey, String regex) {
        Set&lt;String&gt; valSet = new LinkedHashSet&lt;&gt;();

        String html;
        try {
            html = new RestTemplate().getForObject(url, String.class);
        } catch (RestClientException e) {
            e.printStackTrace();
            // The plain HTTP request failed: retry while emulating a browser.
            html = getByBrowser(url);
        }
        // getForObject may yield null and getByBrowser yields "" on failure; there is nothing to parse then.
        if (html == null || html.isEmpty()) {
            return valSet;
        }

        // Any non-empty key counts, including one-character attribute names.
        boolean readAttribute = attributeKey != null &amp;&amp; !attributeKey.isEmpty();
        try {
            Document document = Jsoup.parse(html);
            Elements list = document.select(cssQuery);
            for (Element element : list) {
                if (readAttribute) {
                    String content = element.attr(attributeKey);
                    if (content != null &amp;&amp; content.matches(regex)) {
                        valSet.add(content);
                    }
                } else {
                    valSet.add(element.text());
                }
            }
        } catch (Exception e) {
            // Best effort: report the problem and return what has been collected so far.
            e.printStackTrace();
        }
        return valSet;
    }

    /**
     * Requests the HTML of a page through an emulated Chrome browser, with JavaScript and
     * CSS processing disabled and redirects enabled.
     *
     * @param url request address
     * @author jxd
     * @date 2021/3/25 10:48
     * @return the response body as a string, or an empty string when the request fails
     */
    public static String getByBrowser(String url) {
        // try-with-resources closes the client even when getPage throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setRedirectEnabled(true);
            HtmlPage htmlPage = webClient.getPage(url);
            return htmlPage.getWebResponse().getContentAsString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }
}</code></pre>

页面列表

ITEM_HTML