16、爬取信息并保存至 Excel(二)
<pre><code class="language-python">from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import ssl
import time
import random
import xlsxwriter
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS requests made through urllib skip certificate
# verification. Acceptable only for throwaway scraping; do not reuse elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context
def openUrl(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with a desktop Chrome User-Agent header. The response
    body is decoded as UTF-8 and parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the page's HTML.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The header value is the bare UA string; the field name must not be
    # repeated inside the value.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    return BeautifulSoup(html, 'lxml')
# Listing page used for the exploration below (women's pumps; `pn=1` is
# presumably the page number — TODO confirm).
url = 'https://www.gucci.cn/zh/ca/women/shoes/pumps?pin=11&pn=1'
Soup = openUrl(url)
# Product tiles: <div> wrappers that carry the `e-abtest-code` attribute and
# contain the product <img>.
product_name = Soup.find_all(class_="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click")
# Price containers, one <div class="spice-item-grid-price"> per match.
goods_price = Soup.find_all(class_='spice-item-grid-price')
# Hover images (<img class="visual-hover-img">).
imgs = Soup.find_all(class_='visual-hover-img')
print(imgs)
#print(product_name)
#print(goods_price)
</code></pre>
<pre><code>[<img alt="水晶双G蛇皮浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg 470w"/>, <img alt="蝴蝶结皮革浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2017/5/17/14950303658151893_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2017/5/17/14950303658151893_ws_470X470.jpg 470w"/>, <img alt="GG Marmont系列麂皮低跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_470X470.jpg 470w"/>, <img alt="饰蜜蜂漆皮中跟浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_470X470.jpg 470w"/>, <img alt="蜜蜂装饰漆皮中跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_470X470.jpg 470w"/>, <img alt="蜜蜂装饰漆皮中跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_470X470.jpg 470w"/>, <img alt="饰蜜蜂漆皮中跟浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_470X470.jpg 470w"/>]</code></pre>
<pre><code class="language-python">print(len(imgs))</code></pre>
<pre><code>29</code></pre>
<pre><code class="language-python">print(len(product_name))</code></pre>
<pre><code>58</code></pre>
<pre><code class="language-python">print(len(goods_price))</code></pre>
<pre><code>58</code></pre>
<pre><code class="language-python">goods_price</code></pre>
<pre><code>[<div class="spice-item-grid-price">
<p><del></del></p>
<p>¥9,700</p>
</div>, <div class="spice-item-grid-price">
<p><del></del></p>
<p>¥9,700</p>
</div>, <div class="spice-item-grid-price">
<p><del></del></p>
<p>¥7,200</p>
</div>, <div class="spice-item-grid-price">
<p><del></del></p>
<p>¥7,000</p>
</div>, <div class="spice-item-grid-price">
<p><del></del></p>
<p>¥7,000</p>
</div>]</code></pre>
<pre><code class="language-python">goods_price[0]</code></pre>
<pre><code><div class="spice-item-grid-price">
<p><del></del></p>
<p>¥9,700</p>
</div></code></pre>
<pre><code class="language-python">goods_price[0].text</code></pre>
<pre><code>'\n\n¥9,700\n'</code></pre>
<pre><code class="language-python">def re_str(str):
return str.replace(' ','').replace('\r','').replace('\n','').replace('\t','')
</code></pre>
<h2>第i个产品价格 相邻两个价格一样</h2>
<pre><code class="language-python">re_str(goods_price[0].text)</code></pre>
<pre><code>'¥9,700'</code></pre>
<pre><code class="language-python">re_str(goods_price[3].text)</code></pre>
<pre><code>'¥6,800'</code></pre>
<pre><code class="language-python">product_name[0]</code></pre>
<pre><code><div class="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click" e-abtest-code="548854LXR109089" e-abtest-position="1">
<img alt="水晶双G蛇皮浅口鞋" class="visual-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png" srcset="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 158w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w"/>
</div></code></pre>
<pre><code class="language-python">product_name[0].img['srcset']</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 158w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w'</code></pre>
<pre><code class="language-python">re_str(product_name[0].img['srcset'])</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png158w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png316w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png540w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png470w'</code></pre>
<pre><code class="language-python">re_str(product_name[0].img['srcset']).split(",")</code></pre>
<pre><code>['https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png158w',
'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png316w',
'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png540w',
'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w',
'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png470w']</code></pre>
<pre><code class="language-python">img_lists = re_str(product_name[0].img['srcset']).split(",")</code></pre>
<pre><code class="language-python">img_lists[3]</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w'</code></pre>
<pre><code class="language-python">img_url = img_lists[3]</code></pre>
<pre><code class="language-python">img_url[0:len(img_url)-4]</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png'</code></pre>
<pre><code class="language-python">re_str(product_name[1].img['srcset']).split(",")[3]</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg235w'</code></pre>
<pre><code class="language-python">re_str(product_name[2].img['srcset']).split(",")[3]</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416466889419004_ts_235X235.png235w'</code></pre>
<h2>相邻两个图片一个是png,一个是jpg</h2>
<pre><code class="language-python">re_str(product_name[3].img['srcset']).split(",")[3]</code></pre>
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_235X235.jpg235w'</code></pre>
<h2>编号 相邻两个一样</h2>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">product_name[0]['e-abtest-code']</code></pre>
<pre><code>'548854LXR109089'</code></pre>
<pre><code class="language-python">product_name[1]['e-abtest-code']</code></pre>
<pre><code>'548854LXR109089'</code></pre>
<pre><code class="language-python">product_name[2]['e-abtest-code']</code></pre>
<pre><code>'548855C9D001000'</code></pre>
<pre><code class="language-python">product_name[3]['e-abtest-code']</code></pre>
<pre><code>'548855C9D001000'</code></pre>
<pre><code class="language-python">imgs[0]</code></pre>
<pre><code><img alt="水晶双G蛇皮浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_158X158.jpg 158w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_316X316.jpg 316w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_540X540.jpg 540w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg 235w,
https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg 470w"/></code></pre>
<pre><code class="language-python">imgs[0]['alt']</code></pre>
<pre><code>'水晶双G蛇皮浅口鞋'</code></pre>
<pre><code class="language-python">imgs[1]['alt']</code></pre>
<pre><code>'蝴蝶结皮革浅口鞋'</code></pre>
<pre><code class="language-python">imgs[2]['alt']</code></pre>
<pre><code>'双G皮革中跟浅口鞋'</code></pre>
<pre><code class="language-python">product_name[0]</code></pre>
<pre><code><div class="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click" e-abtest-code="548854LXR109089" e-abtest-position="1">
<img alt="水晶双G蛇皮浅口鞋" class="visual-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png" srcset="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 158w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w,
https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w"/>
</div></code></pre>
<pre><code class="language-python">product_name[0].img['alt']</code></pre>
<pre><code>'水晶双G蛇皮浅口鞋'</code></pre>
<pre><code class="language-python">product_name[1].img['alt']</code></pre>
<pre><code>'水晶双G蛇皮浅口鞋'</code></pre>
<pre><code class="language-python">product_name[2].img['alt']</code></pre>
<pre><code>'蝴蝶结皮革浅口鞋'</code></pre>
<h3>完整写法</h3>
<pre><code>from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import ssl
import time
import random
import xlsxwriter
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS requests made through urllib skip certificate
# verification. Acceptable only for throwaway scraping; do not reuse elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context
def openUrl(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with a desktop Chrome User-Agent header. The response
    body is decoded as UTF-8 and parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the page's HTML.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The header value is the bare UA string; the field name must not be
    # repeated inside the value.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    return BeautifulSoup(html, 'lxml')
# Translation table that deletes exactly the four characters re_str removes.
_RE_STR_DROP = {ord(ch): None for ch in ' \r\n\t'}


def re_str(str):
    """Return *str* with every space, carriage return, newline and tab removed.

    Only these four characters are deleted; any other whitespace (for example
    a vertical tab or a non-breaking space) is left in place.

    Args:
        str: Text to clean. The name shadows the builtin and is kept unchanged
            so existing callers keep working.

    Returns:
        The cleaned string.
    """
    # One pass over the text instead of four chained replace() calls.
    return str.translate(_RE_STR_DROP)
# Listing page for women's pumps (high heels); defined but not fetched below.
url = 'https://www.gucci.cn/zh/ca/women/shoes/pumps?pin=11&pn=3'
# Listing page for men's sweaters; this is the page that gets scraped.
url_sweaters = 'https://www.gucci.cn/zh/ca/men/readytowear/new-sweaters-cardigans?pn=1'
Soup = openUrl(url_sweaters)

# Product tiles (name, item number, image) and price containers.
product_infors = Soup.find_all(class_="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click")
goods_prices = Soup.find_all(class_='spice-item-grid-price')

product_name_list = []
product_item_number_list = []
product_img_list = []
product_price_list = []

# Every product shows up twice in a row in both result lists (same item
# number, same price), so only the even-indexed entries are read.
for i in range(len(goods_prices) // 2):
    tile = product_infors[2 * i]
    product_name_list.append(tile.img['alt'])
    product_item_number_list.append(tile['e-abtest-code'])
    product_price_list.append(re_str(goods_prices[2 * i].text))
    # srcset is a comma-separated list of "<url> <width>w" candidates; on the
    # pages explored so far the fourth candidate is the 235x235 rendition.
    # Splitting on whitespace keeps only the URL, whatever the descriptor's
    # length.
    img_lists = tile.img['srcset'].split(",")
    img_url = img_lists[3].split()[0]
    product_img_list.append(img_url)

# Print the collected fields, one product after another.
for name, price, item_number, image in zip(
    product_name_list,
    product_price_list,
    product_item_number_list,
    product_img_list,
):
    print(name)
    print(price)
    print(item_number)
    print(image)

# Debug output: number of price nodes found and the raw nodes themselves.
goods_price = goods_prices
print(len(goods_price))
print(goods_price)
'''
product_name = Soup.find_all(class_='spice-product-name')
item_title = Soup.find_all(class_="spice-style-number-title")
img = Soup.find_all(class_="spice-standard-image")
not_find = Soup.find_all(class_="spice-payment-img-content")
# -------提取信息---------
if len(product_name) > 0:
# 找到商品
product_name = re_str(product_name[0].text)
goods_price = re_str(goods_price[0].text)
item_title = re_str(item_title[0].text)
imgs = re_str(str(img[0].img["spice-data-image-src"]))#re_str(img[0])
imgs_list = imgs.split(",")
img2_list = imgs_list[2].split(":")
img_url = 'https:' + img2_list[2]
img_url = img_url[0:len(img_url)-1]
return product_name,goods_price,item_title,img_url
else:
# 找不到
return 0, 0, 0, 0
'''</code></pre>