requests
<ol>
<li>安装:<code>pip install requests</code></li>
<li>查看:<code>pip show requests</code></li>
<li>使用:</li>
</ol>
<pre><code># 导入包
import requests
# Target URL (placeholder endpoint; replace with the address you want to call)
url = 'https://httpbin.org/post'
# Form parameters sent in the request body
data = {}
# Request headers
headers = {}
# Send a POST request and keep the response
postRes = requests.post(url, data=data, headers=headers)
# Send a GET request and keep the response
# getRes = requests.get(url)
# Commonly used response attributes
resCode = postRes.status_code    # HTTP status code
resHeaders = postRes.headers     # response headers
resUrl = postRes.url             # final URL of the response
resText = postRes.text           # body decoded to str
resContent = postRes.content     # raw body bytes
</code></pre>
<p>爬取西祠代理ip:</p>
<pre><code>import requests
import lxml
import json
from bs4 import BeautifulSoup
def SendReq(page):
    """Fetch one listing page of xicidaili.com and pass it to GetIp.

    Args:
        page: page number inserted into the listing URL.

    Returns:
        False when the request raises or the status code is not 200;
        None after a 200 response has been handed to GetIp.
    """
    url = f'https://www.xicidaili.com/nn/{page}'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    try:
        # requests has no default timeout; without one a stalled
        # connection would block the whole crawl forever.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException:
        print(f'第{page}页数据请求失败')
        return False
    if res.status_code != 200:
        print(f'第{page}页数据请求失败')
        return False
    GetIp(res, page)
def GetIp(res, page):
    """Extract ip/port pairs from a listing page and merge them into ip.json.

    Args:
        res: response object whose ``text`` attribute holds the page HTML.
        page: page number, forwarded to AddIp for its log message.
    """
    soup = BeautifulSoup(res.text, 'lxml')
    # The 2nd cell of each table row is read as the ip, the 3rd as the port.
    ipTdList = soup.select('tr>td:nth-child(2)')
    portTdList = soup.select('tr>td:nth-child(3)')
    # Pair cells by position; zip stops at the shorter of the two lists.
    ipDict = [
        {'ip': ip.get_text(), 'port': port.get_text()}
        for ip, port in zip(ipTdList, portTdList)
    ]
    # Newly scraped entries come first, followed by what was already saved.
    old_data = GetFileJson()
    if old_data:
        ipDict += old_data
    AddIp(ipDict, page)
def GetFileJson():
    """Load the previously saved proxy entries from ./ip.json.

    Returns:
        The decoded JSON content of the file, or an empty list when the
        file does not exist yet (e.g. on the very first run).
    """
    try:
        with open('./ip.json', 'r') as fr:
            return json.load(fr)
    except FileNotFoundError:
        return []
def AddIp(ipDict, page):
    """Overwrite ./ip.json with the given entries and report the page as saved.

    Args:
        ipDict: JSON-serializable collection of proxy entries to store.
        page: page number, used only in the printed confirmation.
    """
    with open('./ip.json', 'w') as fw:
        json.dump(ipDict, fw)
    # Printed after the file is closed, i.e. once the data is on disk.
    print(f'第{page}页数据写入成功')
# response = SendReq()
# ipDict = GetIp(response)
# AddIp(ipDict)
# GetFileJson()
# Crawl listing pages 1..999 in order.
for page in range(1, 1000):
    print(page)
    SendReq(page)</code></pre>
<p>自己爬自己(用已爬取到的代理ip作为代理,再去爬取西祠代理,并记录可用的ip):</p>
<pre><code>import requests
import lxml
import json
import random
from bs4 import BeautifulSoup
def SendReq(page, max_retries=None):
    """Fetch one listing page through a randomly chosen saved proxy.

    A new proxy is drawn from ip.json for every attempt. Timeouts and
    connection errors trigger another attempt in a loop, so repeated
    failures do not grow the call stack.

    Args:
        page: page number inserted into the listing URL.
        max_retries: maximum number of extra attempts after the first one
            fails; None (the default) keeps retrying until a proxy answers.

    Returns:
        False when the status code is not 200 or the retries are
        exhausted; None after a 200 response has been handed to GetIp.
    """
    url = f'https://www.xicidaili.com/nn/{page}'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    failures = 0
    while True:
        tryIp = SetMyIp()
        proxy_url = f"http://{tryIp['ip']}:{tryIp['port']}"
        proxies = {"http": proxy_url, "https": proxy_url}
        print(proxies)
        try:
            res = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        except requests.exceptions.Timeout:
            print('请求超时')
        except requests.exceptions.ConnectionError:
            print('拒绝连接')
        else:
            break
        failures += 1
        if max_retries is not None and failures > max_retries:
            print(f'第{page}页数据请求失败')
            return False
    print(res.status_code)
    # The proxy produced a response, so it is recorded before the status
    # code is checked.
    SelectIp(proxies)
    if res.status_code != 200:
        print(f'第{page}页数据请求失败')
        return False
    GetIp(res, page)
def SetMyIp():
    """Return one randomly chosen entry from the data stored in ip.json.

    Returns:
        A single element of the sequence returned by GetFileJson.

    Raises:
        IndexError: if the stored sequence is empty.
    """
    return random.choice(GetFileJson())
def GetIp(res, page):
    """Extract ip/port pairs from a fetched page and merge them into ip.json.

    Args:
        res: response object whose ``text`` attribute holds the page HTML.
        page: page number, forwarded to AddIp for its log message.
    """
    soup = BeautifulSoup(res.text, 'lxml')
    # The 2nd cell of each table row is read as the ip, the 3rd as the port.
    ipTdList = soup.select('tr>td:nth-child(2)')
    portTdList = soup.select('tr>td:nth-child(3)')
    # Pair cells by position; zip stops at the shorter of the two lists.
    ipDict = [
        {'ip': ip.get_text(), 'port': port.get_text()}
        for ip, port in zip(ipTdList, portTdList)
    ]
    # Newly scraped entries come first, followed by what was already saved.
    old_data = GetFileJson()
    if old_data:
        ipDict += old_data
    AddIp(ipDict, page)
def GetFileJson():
    """Load the already crawled proxy entries from ./ip.json.

    Returns:
        The decoded JSON content of the file, or an empty list when the
        file does not exist yet.
    """
    try:
        with open('./ip.json', 'r') as fr:
            return json.load(fr)
    except FileNotFoundError:
        return []
def AddIp(ipDict, page):
    """Write the merged proxy entries to ./ip.json, replacing its content.

    Args:
        ipDict: JSON-serializable collection of proxy entries to store.
        page: page number, used only in the printed confirmation.
    """
    with open('./ip.json', 'w') as fw:
        json.dump(ipDict, fw)
    # Printed after the file is closed, i.e. once the data is on disk.
    print(f'第{page}页数据写入成功')
def GetUsabledIp():
    """Load the list of proxies recorded as usable from ./UsabledIp.json.

    Returns:
        The decoded JSON content of the file, or an empty list when the
        file does not exist yet.
    """
    try:
        with open('./UsabledIp.json', 'r') as ruip:
            return json.load(ruip)
    except FileNotFoundError:
        return []
def SelectIp(proxies):
    """Record a working proxy mapping in ./UsabledIp.json.

    Args:
        proxies: the ``{"http": ..., "https": ...}`` mapping that was used
            for a request that produced a response.
    """
    usabledIpDic = GetUsabledIp()
    # The same proxy can succeed many times; store each mapping only once.
    if proxies in usabledIpDic:
        return
    usabledIpDic.append(proxies)
    with open('./UsabledIp.json', 'w') as fuip:
        json.dump(usabledIpDic, fuip)
# Crawl listing pages 1..4 through the saved proxies.
for page in range(1, 5):
    print(page)
    SendReq(page)
# testHttp = {'http': 'http://112.239.22.101:9999', 'https': 'http://112.239.22.101:9999'}
# SelectIp(testHttp)</code></pre>
<p>可用ip(UsabledIp.json 内容示例):</p>
<pre><code>[{"http": "http://223.199.31.104:9999", "https": "http://223.199.31.104:9999"}, {"http": "http://112.239.22.101:9999", "https": "http://112.239.22.101:9999"}, {"http": "http://36.27.29.34:9999", "https": "http://36.27.29.34:9999"}]</code></pre>