爬虫案例技能
<h1>1. 爬虫小案例</h1>
<h3>1.1.1. 爬虫步骤</h3>
<ul>
<li>明确目标(确定在哪个网站搜索)</li>
<li>爬(爬下内容)</li>
<li>取(筛选想要的)</li>
<li>
<p>处理数据(按照你的想法去处理)</p>
<p>package main</p>
<p>import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
)</p>
<p>//这个只是一个简单的版本只是获取QQ邮箱并且没有进行封装操作,另外爬出来的数据也没有进行去重操作
var (
// \d是数字
reQQEmail = <code>(\d+)@qq.com</code>
)</p>
<p>// 爬邮箱
func GetEmail() {
// 1.去网站拿数据
resp, err := http.Get("https://tieba.baidu.com/p/6051076813?red_tag=1573533731")
HandleError(err, "http.Get url")
defer resp.Body.Close()
// 2.读取页面内容
pageBytes, err := ioutil.ReadAll(resp.Body)
HandleError(err, "ioutil.ReadAll")
// 字节转字符串
pageStr := string(pageBytes)
//fmt.Println(pageStr)
// 3.过滤数据,过滤qq邮箱
re := regexp.MustCompile(reQQEmail)
// -1代表取全部
results := re.FindAllStringSubmatch(pageStr, -1)
//fmt.Println(results)</p>
<pre><code>// 遍历结果
for _, result := range results {
fmt.Println("email:", result[0])
fmt.Println("qq:", result[1])
}</code></pre>
<p>}</p>
<p>// 处理异常
func HandleError(err error, why string) {
if err != nil {
fmt.Println(why, err)
}
}
func main() {
GetEmail()
}</p>
</li>
</ul>
<h3>1.1.2. 正则表达式</h3>
<ul>
<li>文档:<a href="https://studygolang.com/pkgdoc">https://studygolang.com/pkgdoc</a></li>
<li>API
<ul>
<li>re := regexp.MustCompile(reStr),传入正则表达式,得到正则表达式对象</li>
<li>ret := re.FindAllStringSubmatch(srcStr,-1):用正则对象获取匹配结果,srcStr是页面内容,-1代表取全部</li>
</ul></li>
<li>爬邮箱</li>
<li>方法抽取</li>
<li>爬超链接</li>
<li>爬手机号
<ul>
<li><a href="http://www.zhaohaowang.com/">http://www.zhaohaowang.com/</a> 如果链接失效了自己找一个有手机号的就好了</li>
</ul></li>
<li>爬身份证号
<ul>
<li><a href="http://henan.qq.com/a/20171107/069413.htm">http://henan.qq.com/a/20171107/069413.htm</a> 如果链接失效了自己找一个就好了</li>
</ul></li>
<li>
<p>爬图片链接</p>
<p>package main</p>
<p>import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
)</p>
<p>var (
// \w代表大小写字母+数字+下划线
reEmail = <code>\w+@\w+\.\w+</code>
// s?有或者没有s
// +代表出现1次或多次
//\s\S各种字符
// +?代表非贪婪模式(尽可能少地匹配)
reLinke = <code>href="(https?://[\s\S]+?)"</code>
rePhone = <code>1[3456789]\d\s?\d{4}\s?\d{4}</code>
reIdcard = <code>[123456789]\d{5}((19\d{2})|(20[01]\d))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dXx]</code>
reImg = <code>https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))</code>
)</p>
<p>// 处理异常
func HandleError(err error, why string) {
if err != nil {
fmt.Println(why, err)
}
}</p>
<p>func GetEmail2(url string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reEmail)
results := re.FindAllStringSubmatch(pageStr, -1)
for _, result := range results {
fmt.Println(result)
}
}</p>
<p>// 抽取根据url获取内容
func GetPageStr(url string) (pageStr string) {
resp, err := http.Get(url)
HandleError(err, "http.Get url")
defer resp.Body.Close()
// 2.读取页面内容
pageBytes, err := ioutil.ReadAll(resp.Body)
HandleError(err, "ioutil.ReadAll")
// 字节转字符串
pageStr = string(pageBytes)
return pageStr
}</p>
<p>func main() {
// 2.抽取的爬邮箱
// GetEmail2("https://tieba.baidu.com/p/6051076813?red_tag=1573533731")
// 3.爬链接
//GetLink("http://www.baidu.com/s?wd=%E8%B4%B4%E5%90%A7%20%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&rsv_spt=1&rsv_iqid=0x98ace53400003985&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=ib&rsv_sug2=0&inputT=5197&rsv_sug4=6345")
// 4.爬手机号
//GetPhone("https://www.zhaohaowang.com/")
// 5.爬身份证号
//GetIdCard("https://henan.qq.com/a/20171107/069413.htm")
// 6.爬图片
// GetImg("http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E7%BE%8E%E5%A5%B3")
}</p>
<p>func GetIdCard(url string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reIdcard)
results := re.FindAllStringSubmatch(pageStr, -1)
for _, result := range results {
fmt.Println(result)
}
}</p>
<p>// 爬链接
func GetLink(url string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reLinke)
results := re.FindAllStringSubmatch(pageStr, -1)
for _, result := range results {
fmt.Println(result[1])
}
}</p>
<p>//爬手机号
func GetPhone(url string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(rePhone)
results := re.FindAllStringSubmatch(pageStr, -1)
for _, result := range results {
fmt.Println(result)
}
}</p>
<p>func GetImg(url string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reImg)
results := re.FindAllStringSubmatch(pageStr, -1)
for _, result := range results {
fmt.Println(result[0])
}
}</p>
</li>
</ul>
<h3>1.1.3. 并发爬取美图</h3>
<p>下面的两个是即将要爬的网站,如果网址失效自己换一个就好了</p>
<ul>
<li>
<p><a href="https://www.bizhizu.cn/shouji/tag-%E5%8F%AF%E7%88%B1/1.html">https://www.bizhizu.cn/shouji/tag-%E5%8F%AF%E7%88%B1/1.html</a></p>
<p>package main</p>
<p>import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
)</p>
<p>func HandleError(err error, why string) {
if err != nil {
fmt.Println(why, err)
}
}</p>
<p>// 下载图片,传入的是图片叫什么
func DownloadFile(url string, filename string) (ok bool) {
resp, err := http.Get(url)
HandleError(err, "http.get.url")
defer resp.Body.Close()
bytes, err := ioutil.ReadAll(resp.Body)
HandleError(err, "resp.body")
filename = "E:/topgoer.com/src/github.com/student/3.0/img/" + filename
// 写出数据
err = ioutil.WriteFile(filename, bytes, 0666)
if err != nil {
return false
} else {
return true
}
}</p>
<p>// 并发爬思路:
// 1.初始化数据管道
// 2.爬虫写出:26个协程向管道中添加图片链接
// 3.任务统计协程:检查26个任务是否都完成,完成则关闭数据管道
// 4.下载协程:从管道里读取链接并下载</p>
<p>var (
// 存放图片链接的数据管道
chanImageUrls chan string
waitGroup sync.WaitGroup
// 用于监控协程
chanTask chan string
reImg = <code>https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))</code>
)</p>
<p>func main() {
// myTest()
// DownloadFile("http://i1.shaodiyejin.com/uploads/tu/201909/10242/e5794daf58_4.jpg", "1.jpg")</p>
<pre><code>// 1.初始化管道
chanImageUrls = make(chan string, 1000000)
chanTask = make(chan string, 26)
// 2.爬虫协程
for i := 1; i < 27; i++ {
waitGroup.Add(1)
go getImgUrls("https://www.bizhizu.cn/shouji/tag-%E5%8F%AF%E7%88%B1/" + strconv.Itoa(i) + ".html")
}
// 3.任务统计协程,统计26个任务是否都完成,完成则关闭管道
waitGroup.Add(1)
go CheckOK()
// 4.下载协程:从管道中读取链接并下载
for i := 0; i < 5; i++ {
waitGroup.Add(1)
go DownloadImg()
}
waitGroup.Wait()</code></pre>
<p>}</p>
<p>// 下载图片
func DownloadImg() {
for url := range chanImageUrls {
filename := GetFilenameFromUrl(url)
ok := DownloadFile(url, filename)
if ok {
fmt.Printf("%s 下载成功\n", filename)
} else {
fmt.Printf("%s 下载失败\n", filename)
}
}
waitGroup.Done()
}</p>
<p>// 截取url名字
func GetFilenameFromUrl(url string) (filename string) {
// 返回最后一个/的位置
lastIndex := strings.LastIndex(url, "/")
// 切出来
filename = url[lastIndex+1:]
// 时间戳解决重名
timePrefix := strconv.Itoa(int(time.Now().UnixNano()))
filename = timePrefix + "_" + filename
return
}</p>
<p>// 任务统计协程
func CheckOK() {
var count int
for {
url := <-chanTask
fmt.Printf("%s 完成了爬取任务\n", url)
count++
if count == 26 {
close(chanImageUrls)
break
}
}
waitGroup.Done()
}</p>
<p>// 爬图片链接到管道
// url是传的整页链接
func getImgUrls(url string) {
urls := getImgs(url)
// 遍历切片里所有链接,存入数据管道
for _, url := range urls {
chanImageUrls <- url
}
// 标识当前协程完成
// 每完成一个任务,写一条数据
// 用于监控协程知道已经完成了几个任务
chanTask <- url
waitGroup.Done()
}</p>
<p>// 获取当前页图片链接
func getImgs(url string) (urls []string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reImg)
results := re.FindAllStringSubmatch(pageStr, -1)
fmt.Printf("共找到%d条结果\n", len(results))
for _, result := range results {
url := result[0]
urls = append(urls, url)
}
return
}</p>
<p>// 抽取根据url获取内容
func GetPageStr(url string) (pageStr string) {
resp, err := http.Get(url)
HandleError(err, "http.Get url")
defer resp.Body.Close()
// 2.读取页面内容
pageBytes, err := ioutil.ReadAll(resp.Body)
HandleError(err, "ioutil.ReadAll")
// 字节转字符串
pageStr = string(pageBytes)
return pageStr
}</p>
</li>
</ul>