Legendary

李洋的学习笔记


requests

<ol>
<li>安装:<code>pip install requests</code></li>
<li>查看:<code>pip show requests</code></li>
<li>使用:</li>
</ol>
<pre><code># Import the package
import requests
# Target URL (placeholder; replace with the real endpoint)
url = 'https://example.com'
# Request parameters
data = {}
# Request headers
headers = {}
# Send a POST request and keep the response
postRes = requests.post(url, data = data, headers = headers)
# Send a GET request and keep the response
# getRes = requests.get(url)
# Commonly used attributes of the response object
resCode = postRes.status_code
resHeaders = postRes.headers
resUrl = postRes.url
resText = postRes.text
resContent = postRes.content
</code></pre>
<p>爬取西祠代理ip:</p>
<pre><code>import requests
import lxml
import json
from bs4 import BeautifulSoup


def SendReq(page):
    """Fetch one listing page and pass the response to GetIp.

    Returns False when the response status is not 200, otherwise None.
    """
    url = f'https://www.xicidaili.com/nn/{page}'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        GetIp(res, page)
    else:
        print(f'第{page}页数据请求失败')
        return False


def GetIp(res, page):
    """Extract ip/port pairs from the page and merge them into ip.json."""
    soup = BeautifulSoup(res.text, 'lxml')
    # The 2nd cell of each row is read as the ip, the 3rd cell as the port.
    ipList = [td.get_text() for td in soup.select('tr&gt;td:nth-child(2)')]
    portList = [td.get_text() for td in soup.select('tr&gt;td:nth-child(3)')]
    # Pair the n-th ip with the n-th port. zip stops at the shorter list,
    # which is what the earlier index-matching nested loop produced.
    ipDict = [{'ip': ip, 'port': port} for ip, port in zip(ipList, portList)]
    old_data = GetFileJson()
    # ip.json may hold an empty object; only a list of entries is merged,
    # so that rewriting the file keeps what was saved before.
    if isinstance(old_data, list):
        ipDict += old_data
    AddIp(ipDict, page)


def GetFileJson():
    """Return the parsed content of ip.json, or an empty list if it is missing."""
    try:
        with open('./ip.json', 'r') as fr:
            return json.load(fr)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        return []


def AddIp(ipDict, page):
    """Overwrite ip.json with ipDict and report which page was written."""
    with open('./ip.json', 'w') as fw:
        json.dump(ipDict, fw)
    print(f'第{page}页数据写入成功')


# Earlier step-by-step experiments, kept for reference:
# response = SendReq()
# ipDict = GetIp(response)
# AddIp(ipDict)
# GetFileJson()

# Crawl listing pages 1 through 999.
for page in range(1, 1000):
    print(page)
    SendReq(page)</code></pre>
<p>自己爬自己</p>
<pre><code>import requests
import lxml
import json
import random
from bs4 import BeautifulSoup


def SendReq(page):
    """Fetch one listing page through a randomly chosen saved proxy.

    A timed-out or refused connection is retried with a newly picked proxy.
    The retry is a loop, not a recursive call, so a long run of dead
    proxies cannot exhaust the recursion limit.

    Returns False when the response status is not 200, otherwise None.
    """
    url = f'https://www.xicidaili.com/nn/{page}'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    while True:
        tryIp = SetMyIp()
        proxies = {
            "http": f"http://{tryIp['ip']}:{tryIp['port']}",
            "https": f"http://{tryIp['ip']}:{tryIp['port']}"
        }
        print(proxies)
        # Catch request errors and try again with another proxy.
        try:
            res = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        except requests.exceptions.Timeout:
            print('请求超时')
            continue
        except requests.exceptions.ConnectionError:
            print('拒绝连接')
            continue
        print(res.status_code)
        # The proxy returned a response, so record it as usable.
        SelectIp(proxies)
        if res.status_code == 200:
            # print(res.headers)
            # Request succeeded; continue with parsing.
            GetIp(res, page)
            return None
        print(f'第{page}页数据请求失败')
        return False


def SetMyIp():
    """Return one randomly chosen entry from the crawled ip.json.

    Raises IndexError when ip.json holds an empty list.
    """
    return random.choice(GetFileJson())


def GetIp(res, page):
    """Extract ip/port pairs from the page and merge them into ip.json."""
    soup = BeautifulSoup(res.text, 'lxml')
    # The 2nd cell of each row is read as the ip, the 3rd cell as the port.
    ipList = [td.get_text() for td in soup.select('tr&gt;td:nth-child(2)')]
    portList = [td.get_text() for td in soup.select('tr&gt;td:nth-child(3)')]
    # Pair the n-th ip with the n-th port. zip stops at the shorter list,
    # which is what the earlier index-matching nested loop produced.
    ipDict = [{'ip': ip, 'port': port} for ip, port in zip(ipList, portList)]
    old_data = GetFileJson()
    # ip.json may hold an empty object; only a list of entries is merged,
    # so that rewriting the file keeps what was saved before.
    if isinstance(old_data, list):
        ipDict += old_data
    AddIp(ipDict, page)


def GetFileJson():
    """Return the parsed content of ip.json, or an empty list if it is missing."""
    try:
        with open('./ip.json', 'r') as fr:
            return json.load(fr)
    except FileNotFoundError:
        # Nothing has been saved yet.
        return []


def AddIp(ipDict, page):
    """Overwrite ip.json with ipDict and report which page was written."""
    with open('./ip.json', 'w') as fw:
        json.dump(ipDict, fw)
    print(f'第{page}页数据写入成功')


def GetUsabledIp():
    """Return the saved list from UsabledIp.json, or an empty list if it is missing."""
    try:
        with open('./UsabledIp.json', 'r') as ruip:
            return json.load(ruip)
    except FileNotFoundError:
        # No usable proxy has been recorded yet.
        return []


def SelectIp(proxies):
    """Append a proxies mapping to UsabledIp.json."""
    usabledIpDic = GetUsabledIp()
    usabledIpDic.append(proxies)
    with open('./UsabledIp.json', 'w') as fuip:
        json.dump(usabledIpDic, fuip)


# Crawl listing pages 1 through 4.
for page in range(1, 5):
    print(page)
    SendReq(page)
# Manual check of SelectIp with a known proxy:
# testHttp = {'http': 'http://112.239.22.101:9999', 'https': 'http://112.239.22.101:9999'}
# SelectIp(testHttp)</code></pre>
<p>可用ip</p>
<pre><code>[{"http": "http://223.199.31.104:9999", "https": "http://223.199.31.104:9999"}, {"http": "http://112.239.22.101:9999", "https": "http://112.239.22.101:9999"}, {"http": "http://36.27.29.34:9999", "https": "http://36.27.29.34:9999"}]</code></pre>

页面列表

ITEM_HTML