python


16、爬取信息并保存至 Excel(二)

<pre><code class="language-python">from bs4 import BeautifulSoup import urllib.request import pandas as pd import ssl import time import random import xlsxwriter ssl._create_default_https_context = ssl._create_unverified_context # 返回html的soup解析 def openUrl(url): headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} req = urllib.request.Request(url, headers=headers) response = urllib.request.urlopen(req) #请求 html = response.read().decode("utf-8") Soup = BeautifulSoup(html, 'lxml') return Soup url = 'https://www.gucci.cn/zh/ca/women/shoes/pumps?pin=11&amp;pn=1' Soup = openUrl(url) product_name = Soup.find_all(class_="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click") goods_price = Soup.find_all(class_='spice-item-grid-price') imgs = Soup.find_all(class_='visual-hover-img') print(imgs) #print(product_name) #print(goods_price) </code></pre> <pre><code>[&lt;img alt="水晶双G蛇皮浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg 470w"/&gt;, &lt;img alt="蝴蝶结皮革浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_316X316.jpg 316w, 
https://res.gucci.cn/resources/2017/5/17/14950303658151893_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2017/5/17/14950303658151893_ws_470X470.jpg 470w"/&gt;, &lt;img alt="GG Marmont系列麂皮低跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2017/5/16/14948755883815317_ws_470X470.jpg 470w"/&gt;, &lt;img alt="饰蜜蜂漆皮中跟浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2018/5/5/15254538192304230_ws_470X470.jpg 470w"/&gt;, &lt;img alt="蜜蜂装饰漆皮中跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2017/6/7/14968447071421011_ws_470X470.jpg 470w"/&gt;, 
&lt;img alt="蜜蜂装饰漆皮中跟鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2017/6/7/14968438178803005_ws_470X470.jpg 470w"/&gt;, &lt;img alt="饰蜜蜂漆皮中跟浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2018/5/9/15258151028805179_ws_470X470.jpg 470w"/&gt;]</code></pre> <pre><code class="language-python">print(len(imgs))</code></pre> <pre><code>29</code></pre> <pre><code class="language-python">print(len(product_name))</code></pre> <pre><code>58</code></pre> <pre><code class="language-python">print(len(goods_price))</code></pre> <pre><code>58</code></pre> <pre><code class="language-python">goods_price</code></pre> <pre><code>[&lt;div class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥9,700&lt;/p&gt; &lt;/div&gt;, &lt;div class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥9,700&lt;/p&gt; &lt;/div&gt;, &lt;div class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥7,200&lt;/p&gt; &lt;/div&gt;, &lt;div 
class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥7,000&lt;/p&gt; &lt;/div&gt;, &lt;div class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥7,000&lt;/p&gt; &lt;/div&gt;]</code></pre> <pre><code class="language-python">goods_price[0]</code></pre> <pre><code>&lt;div class="spice-item-grid-price"&gt; &lt;p&gt;&lt;del&gt;&lt;/del&gt;&lt;/p&gt; &lt;p&gt;¥9,700&lt;/p&gt; &lt;/div&gt;</code></pre> <pre><code class="language-python">goods_price[0].text</code></pre> <pre><code>'\n\n¥9,700\n'</code></pre> <pre><code class="language-python">def re_str(str): return str.replace(' ','').replace('\r','').replace('\n','').replace('\t','') </code></pre> <h2>第i个产品价格 相邻两个价格一样</h2> <pre><code class="language-python">re_str(goods_price[0].text)</code></pre> <pre><code>'¥9,700'</code></pre> <pre><code class="language-python">re_str(goods_price[3].text)</code></pre> <pre><code>'¥6,800'</code></pre> <pre><code class="language-python">product_name[0]</code></pre> <pre><code>&lt;div class="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click" e-abtest-code="548854LXR109089" e-abtest-position="1"&gt; &lt;img alt="水晶双G蛇皮浅口鞋" class="visual-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png" srcset="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 158w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w"/&gt; &lt;/div&gt;</code></pre> <pre><code class="language-python">product_name[0].img['srcset']</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 
158w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w,\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\thttps://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w'</code></pre> <pre><code class="language-python">re_str(product_name[0].img['srcset'])</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png158w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png316w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png540w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w,https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png470w'</code></pre> <pre><code class="language-python">re_str(product_name[0].img['srcset']).split(",")</code></pre> <pre><code>['https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png158w', 'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png316w', 'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png540w', 'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w', 'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png470w']</code></pre> <pre><code class="language-python">img_lists = re_str(product_name[0].img['srcset']).split(",")</code></pre> <pre><code class="language-python">img_lists[3]</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png235w'</code></pre> <pre><code class="language-python">img_url = img_lists[3]</code></pre> <pre><code class="language-python">img_url[0:len(img_url)-4]</code></pre> 
<pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png'</code></pre> <pre><code class="language-python">re_str(product_name[1].img['srcset']).split(",")[3]</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg235w'</code></pre> <pre><code class="language-python">re_str(product_name[2].img['srcset']).split(",")[3]</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416466889419004_ts_235X235.png235w'</code></pre> <h2>相邻两个图片一个是png,一个是jpg</h2> <pre><code class="language-python">re_str(product_name[3].img['srcset']).split(",")[3]</code></pre> <pre><code>'https://res.gucci.cn/resources/2018/11/8/15416467287116413_ws_235X235.jpg235w'</code></pre> <h2>编号 相邻两个一样</h2> <pre><code class="language-python"></code></pre> <pre><code class="language-python">product_name[0]['e-abtest-code']</code></pre> <pre><code>'548854LXR109089'</code></pre> <pre><code class="language-python">product_name[1]['e-abtest-code']</code></pre> <pre><code>'548854LXR109089'</code></pre> <pre><code class="language-python">product_name[2]['e-abtest-code']</code></pre> <pre><code>'548855C9D001000'</code></pre> <pre><code class="language-python">product_name[3]['e-abtest-code']</code></pre> <pre><code>'548855C9D001000'</code></pre> <pre><code class="language-python">imgs[0]</code></pre> <pre><code>&lt;img alt="水晶双G蛇皮浅口鞋" class="visual-hover-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg" srcset="https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_158X158.jpg 158w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_316X316.jpg 316w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_540X540.jpg 540w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_235X235.jpg 235w, https://res.gucci.cn/resources/2018/11/8/15416460432374077_ws_470X470.jpg 470w"/&gt;</code></pre> 
<pre><code class="language-python">imgs[0]['alt']</code></pre> <pre><code>'水晶双G蛇皮浅口鞋'</code></pre> <pre><code class="language-python">imgs[1]['alt']</code></pre> <pre><code>'蝴蝶结皮革浅口鞋'</code></pre> <pre><code class="language-python">imgs[2]['alt']</code></pre> <pre><code>'双G皮革中跟浅口鞋'</code></pre> <pre><code class="language-python">product_name[0]</code></pre> <pre><code>&lt;div class="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click" e-abtest-code="548854LXR109089" e-abtest-position="1"&gt; &lt;img alt="水晶双G蛇皮浅口鞋" class="visual-img" sizes="(max-width: 767px) 158px, (max-width: 1023px) 235px, 470px" src="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png" srcset="https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_158X158.png 158w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_316X316.png 316w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_540X540.png 540w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_235X235.png 235w, https://res.gucci.cn/resources/2018/11/8/15416460113603089_ts_470X470.png 470w"/&gt; &lt;/div&gt;</code></pre> <pre><code class="language-python">product_name[0].img['alt']</code></pre> <pre><code>'水晶双G蛇皮浅口鞋'</code></pre> <pre><code class="language-python">product_name[1].img['alt']</code></pre> <pre><code>'水晶双G蛇皮浅口鞋'</code></pre> <pre><code class="language-python">product_name[2].img['alt']</code></pre> <pre><code>'蝴蝶结皮革浅口鞋'</code></pre> <h3>完整写法</h3> <pre><code>from bs4 import BeautifulSoup import urllib.request import pandas as pd import ssl import time import random import xlsxwriter ssl._create_default_https_context = ssl._create_unverified_context # 返回html的soup解析 def openUrl(url): headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} req = urllib.request.Request(url, headers=headers) response = urllib.request.urlopen(req) #请求 html = 
response.read().decode("utf-8") #print(html) Soup = BeautifulSoup(html, 'lxml') return Soup # 正则化str def re_str(str): return str.replace(' ','').replace('\r','').replace('\n','').replace('\t','') # 高跟鞋 url = 'https://www.gucci.cn/zh/ca/women/shoes/pumps?pin=11&amp;pn=3' # 男—毛衣 url_sweaters = 'https://www.gucci.cn/zh/ca/men/readytowear/new-sweaters-cardigans?pn=1' Soup = openUrl(url_sweaters) product_infors = Soup.find_all(class_="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click") goods_prices = Soup.find_all(class_='spice-item-grid-price') product_name_list = [] product_item_number_list = [] product_img_list = [] product_price_list = [] for i in range(0,len(goods_prices)//2): product_name_list.append(product_infors[2*i].img['alt']) product_item_number_list.append(product_infors[2*i]['e-abtest-code']) product_price_list.append(re_str(goods_prices[2*i].text)) img_lists = re_str(product_infors[2*i].img['srcset']).split(",") img_url = img_lists[3] img_url = img_url[0:len(img_url) - 4] product_img_list.append(img_url) for i in range(0,len(product_name_list)): print(product_name_list[i]) print(product_price_list[i]) print(product_item_number_list[i]) print(product_img_list[i]) product_name = re_str(product_infor[0].text) goods_price = re_str(goods_price[0].text) item_title = re_str(item_title[0].text) imgs = re_str(str(img[0].img["spice-data-image-src"]))#re_str(img[0]) imgs_list = imgs.split(",") img2_list = imgs_list[2].split(":") img_url = 'https:' + img2_list[2] img_url = img_url[0:len(img_url)-1] #return product_name,goods_price,item_title,img_url #imgs = Soup.find_all(class_='visual-hover-img') # product_name = Soup.find_all(class_="spice-item-grid-img-box e-abtest-all-click e-abtest-code-click") goods_price = Soup.find_all(class_='spice-item-grid-price') # imgs = Soup.find_all(class_='visual-hover-img') # print(imgs) #print(product_name) print(len(goods_price)) print(goods_price) ''' product_name = Soup.find_all(class_='spice-product-name') 
item_title = Soup.find_all(class_="spice-style-number-title") img = Soup.find_all(class_="spice-standard-image") not_find = Soup.find_all(class_="spice-payment-img-content") # -------提取信息--------- if len(product_name) &gt; 0: # 找到商品 product_name = re_str(product_name[0].text) goods_price = re_str(goods_price[0].text) item_title = re_str(item_title[0].text) imgs = re_str(str(img[0].img["spice-data-image-src"]))#re_str(img[0]) imgs_list = imgs.split(",") img2_list = imgs_list[2].split(":") img_url = 'https:' + img2_list[2] img_url = img_url[0:len(img_url)-1] return product_name,goods_price,item_title,img_url else: # 找不到 return 0, 0, 0, 0 '''</code></pre>

页面列表

ITEM_HTML