爬虫
<h3>HtmlUnit+Jsoup</h3>
<ul>
<li>依赖:</li>
</ul>
<pre><code> <dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.23</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency></code></pre>
<ul>
<li>案例:</li>
</ul>
<pre><code>package com.example.demo.spider;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestClientException;
import org.springframework.web.client.RestTemplate;
import java.util.LinkedHashSet;
import java.util.Set;
/**
 * Web crawler demo: downloads a page and extracts values from the elements
 * matched by a CSS selector.
 *
 * @author jxd
 * @version 1.0
 * @date 2021/3/25 9:31
 */
public class App {

    /**
     * Runs the crawler against a fixed mvnrepository.com page and prints every
     * extracted value on its own line.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url = "https://mvnrepository.com/artifact/org.jsoup/jsoup/1.13.1";
        String regex = "^/.*";
        String cssQuery = ".categories li a";
        // Empty attribute key: collect the text of each matched element instead of an attribute.
        String attribute = "";
        Set<String> set = getValByUrlAndCssSelect(url, cssQuery, attribute, regex);
        System.out.println("爬虫结果:::");
        for (String s : set) {
            System.out.println(s);
        }
    }

    /**
     * Fetches the page at {@code url} and collects values from every element
     * matched by {@code cssQuery}.
     *
     * <p>When {@code attributeKey} is non-empty, the value of that attribute is
     * collected for each element, but only if the whole value matches
     * {@code regex}. When {@code attributeKey} is {@code null} or empty, the
     * text of each element is collected and {@code regex} is not used.
     *
     * <p>The page is first requested with {@link RestTemplate}; if that throws a
     * {@link RestClientException}, the request is retried through
     * {@link #getByBrowser(String)}. Failures while parsing or selecting are
     * printed to standard error and whatever was collected so far is returned.
     *
     * <p>Originally written by jxd, 2021/3/25 10:07.
     *
     * @param url          HTTP address of the page
     * @param cssQuery     CSS selector
     * @param attributeKey attribute name to read, or {@code null}/empty to read element text
     * @param regex        regular expression an attribute value must fully match
     * @return the collected values in encounter order, without duplicates; never {@code null}
     */
    public static Set<String> getValByUrlAndCssSelect(String url, String cssQuery, String attributeKey, String regex) {
        String html;
        try {
            html = new RestTemplate().getForObject(url, String.class);
        } catch (RestClientException e) {
            e.printStackTrace();
            // Plain HTTP request failed; retry with a simulated browser.
            html = getByBrowser(url);
        }

        Set<String> valSet = new LinkedHashSet<>();
        if (html == null) {
            // No response body: nothing to parse.
            return valSet;
        }

        // Any non-empty key selects attribute mode (the old "length() > 1" test
        // wrongly ignored one-character attribute names).
        boolean readAttribute = attributeKey != null && !attributeKey.isEmpty();
        try {
            // Compile once instead of once per element; only needed in attribute mode.
            java.util.regex.Pattern pattern =
                    readAttribute ? java.util.regex.Pattern.compile(regex) : null;
            Document document = Jsoup.parse(html);
            for (Element element : document.select(cssQuery)) {
                if (readAttribute) {
                    String content = element.attr(attributeKey);
                    if (content != null && pattern.matcher(content).matches()) {
                        valSet.add(content);
                    }
                } else {
                    valSet.add(element.text());
                }
            }
        } catch (Exception e) {
            // Best effort: report the problem and return what was collected.
            e.printStackTrace();
        }
        return valSet;
    }

    /**
     * Requests the HTML of {@code url} through an HtmlUnit {@link WebClient}
     * configured as Chrome, with JavaScript and CSS disabled and redirects
     * enabled.
     *
     * <p>Originally written by jxd, 2021/3/25 10:48.
     *
     * @param url address to request
     * @return the response content as a string, or an empty string if the request fails
     */
    public static String getByBrowser(String url) {
        // try-with-resources closes the client even when the request throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setRedirectEnabled(true);
            HtmlPage htmlPage = webClient.getPage(url);
            return htmlPage.getWebResponse().getContentAsString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }
}
</code></pre>