11、爬天猫,使用手机版页面
<pre><code class="language-python">from selenium import webdriver
# Start Chrome with an iPhone user-agent string so that the mobile version of
# the Tmall listing page is requested.
option = webdriver.ChromeOptions()
option.add_argument('--user-agent=iphone')
# Selenium emits a DeprecationWarning for the `chrome_options=` keyword and
# names `options=` as its replacement, so pass the options that way.
browser = webdriver.Chrome(options=option)
# Listing URL opened first (tile layout, category 54294290, price cap 25360).
url = 'https://list.tmall.com/search_product.htm?q=gucci%C4%D0%B0%FC&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=pd&style=tile&cat=54294290&end_price=25360'
browser.get(url)</code></pre>
<pre><code>C:\Users\zcr\.conda\envs\py36\lib\site-packages\ipykernel\__main__.py:6: DeprecationWarning: use options instead of chrome_options</code></pre>
<pre><code class="language-python">men_bag_url = 'https://list.tmall.com/search_product.htm?q=gucci%C4%D0%B0%FC&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=pd&style=tile&cat=54294290&end_price=25360'
# Per-category listing URLs. Each one carries its own `end_price` cap.
# NOTE(review): the `q` values look like GBK percent-encoded Chinese keywords
# (here presumably "gucci 女包", i.e. women's bags) — confirm before editing.
women_bag_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C5%AE%B0%FC&type=p&spm=875.7931836/B.a2227oh.d100&xl=Gucci_1&from=mallfp..pc_1_suggest&sort=pd&cat=54294290&end_price=39999'
# Presumably "gucci 男鞋" (men's shoes) — TODO confirm the decoding.
men_shoes_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%D0%AC&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&cat=54298748&style=list&end_price=24047'
# Presumably "gucci 男围巾" (men's scarves) — TODO confirm the decoding.
men_scarve_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%CE%A7%BD%ED&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=pd&style=list&end_price=6830'
women_scarve_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C5%AE%CE%A7%BD%ED&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=10970'</code></pre>
<pre><code class="language-python">men_belt_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%D1%FC%B4%F8&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=9620'</code></pre>
<pre><code class="language-python">women_belts_url = 'https://list.tmall.com/search_product.htm?q=gucci%C5%AE%D1%FC%B4%F8&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=12563'</code></pre>
<pre><code class="language-python">men_clothes_url = 'https://list.tmall.com/search_product.htm?q=gucci%C4%D0%D7%B0&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=25480'
women_clothes_url = 'https://list.tmall.com/search_product.htm?q=gucciŮװ&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=25940'</code></pre>
<pre><code class="language-python">import hashlib</code></pre>
<pre><code class="language-python">products_set = set()
products_list = []</code></pre>
<pre><code class="language-python">all_url = 'https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=39999'</code></pre>
<pre><code class="language-python">#products_list = []
def _collect_new_products(list_items, seen_hashes, products):
    """Append every not-yet-seen listing entry to *products*.

    Entries are de-duplicated by the MD5 hex digest of their title: when the
    digest is already in *seen_hashes* the entry is skipped, otherwise the
    digest is recorded and a dict holding the image URL, product URL, title,
    shop name and price is appended to *products*.

    Returns the number of entries that were appended.
    """
    added = 0
    for item in list_items:
        image = item.find_all(class_='li_img_wap')[0].img
        product_title = image['alt']
        hash_title = hashlib.md5(product_title.encode(encoding='UTF-8')).hexdigest()
        if hash_title in seen_hashes:
            continue
        seen_hashes.add(hash_title)
        price_text = item.find_all(class_='lii_price')[0].text
        products.append({
            'product_img_url': image['src'],
            'product_url': 'https:' + item['href'],
            'product_title': product_title,
            'product_shop_name': item.find_all(class_='lii_sold')[0].span.text,
            # Drops the first 4 and the last 6 characters of the price text
            # (presumably surrounding symbols and the decimal part — confirm
            # against the page markup).
            'product_price': price_text[4:-6],
        })
        added += 1
    return added


# Start with empty de-duplication state for this crawl.
products_set = set()
products_list = []

# Crawl the all-Gucci listing. A category URL that used to be assigned to
# `url` just before this line was overwritten immediately and never used.
url = all_url
browser.get(url)
time.sleep(3)

# First pass: parse whatever the page shows right after loading.
red_soup = BeautifulSoup(browser.page_source, 'lxml')
list_items = red_soup.find_all(class_='list_item')
_collect_new_products(list_items, products_set, products_list)

# Scroll down ten times, 2 s apart, starting at scrollTop=300 and moving 800
# further each time (presumably so the page loads more entries). The offset
# has its own name so that no loop index can overwrite it.
scroll_top = 300
for _ in range(10):
    time.sleep(2)
    browser.execute_script("var q=document.documentElement.scrollTop=" + str(scroll_top))
    scroll_top += 800

# Second pass: parse again after scrolling; the title hashes keep entries that
# were already collected in the first pass from being added twice.
red_soup = BeautifulSoup(browser.page_source, 'lxml')
list_items = red_soup.find_all(class_='list_item')
_collect_new_products(list_items, products_set, products_list)
#print(len(products_list))</code></pre>
<pre><code class="language-python">len(products_list)</code></pre>
<pre><code>40</code></pre>
<pre><code class="language-python">products_list[39]</code></pre>
<pre><code>{'product_img_url': '//img.alicdn.com/bao/uploaded/i3/2010197355/TB2L8gVxm8YBeNkSnb4XXaevFXa_!!2010197355.png_125x125Q50s50.jpg_.webp',
'product_price': '13590',
'product_shop_name': '孔雀翎海外专营店',
'product_title': 'Gucci/古奇2019新款特色GG双皮革手柄387102薄款公文手提男包',
'product_url': 'https://detail.tmall.com/item.htm?id=573909557695&skuId=3916716010809&pic=//img.alicdn.com/bao/uploaded/i3/2010197355/TB2L8gVxm8YBeNkSnb4XXaevFXa_!!2010197355.png_125x125Q50s50.jpg_.webp&itemTitle=Gucci/%E5%8F%A4%E5%A5%872019%E6%96%B0%E6%AC%BE%E7%89%B9%E8%89%B2GG%E5%8F%8C%E7%9A%AE%E9%9D%A9%E6%89%8B%E6%9F%84387102%E8%96%84%E6%AC%BE%E5%85%AC%E6%96%87%E6%89%8B%E6%8F%90%E7%94%B7%E5%8C%85&price=13590.00&from=h5'}</code></pre>
<pre><code class="language-python">i = 300</code></pre>
<p>7</p>
<pre><code class="language-python">js="var q=document.documentElement.scrollTop="+str(i)
browser.execute_script(js)
i = i+800</code></pre>
<pre><code class="language-python">women_bag_page_source1 = browser.page_source</code></pre>
<pre><code class="language-python">from bs4 import BeautifulSoup</code></pre>
<pre><code class="language-python">red_soup = BeautifulSoup(women_bag_page_source1, 'lxml')</code></pre>
<pre><code class="language-python">len(products_list)</code></pre>
<pre><code>35</code></pre>
<pre><code class="language-python">products_list = []
# Convert every listing entry found in red_soup into a dict and append it to
# products_list. No de-duplication happens in this cell.
list_items = red_soup.find_all(class_='list_item')
for entry in list_items:
    thumbnail = entry.find_all(class_='li_img_wap')[0].img
    product_img_url = thumbnail['src']
    product_title = thumbnail['alt']
    sold_node = entry.find_all(class_='lii_sold')[0]
    product_shop_name = sold_node.span.text
    product_sold_info = sold_node.text
    product_url = 'https:' + entry['href']
    price_text = entry.find_all(class_='lii_price')[0].text
    # Keep the price text minus its first 4 and last 6 characters.
    product_price = price_text[4:-6]
    products_list.append({
        'product_img_url': product_img_url,
        'product_url': product_url,
        'product_title': product_title,
        'product_shop_name': product_shop_name,
        'product_price': product_price,
    })
</code></pre>
<pre><code class="language-python">product_title = list_items[0](class_='li_img_wap')[0].img['alt']</code></pre>
<pre><code>'gucci黑色绒面系列复古单肩包'</code></pre>
<pre><code class="language-python">len_list = len(products_list)
products_list[len_list-1]['product_price']</code></pre>
<pre><code>'9200'</code></pre>
<pre><code class="language-python">import time</code></pre>
<pre><code class="language-python">for m in range(1,58):
len_list = len(products_list)
url_i = 'https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price='
url_o = products_list[len_list-1]['product_price']
url = url_i + url_o
print(url)
browser.get(url)
time.sleep(3)
red_soup = BeautifulSoup(browser.page_source, 'lxml')
list_items = red_soup.find_all(class_='list_item')
for i in range(0,len(list_items)):
product_img_url = list_items[i](class_='li_img_wap')[0].img['src']
product_title = list_items[i](class_='li_img_wap')[0].img['alt']
product_shop_name = list_items[i](class_='lii_sold')[0].span.text
product_sold_info = list_items[i](class_='lii_sold')[0].text
product_url = 'https:' +list_items[i]['href']
a = list_items[i](class_='lii_price')[0].text
b = a[:len(a)-6]
product_price = b[4:len(b)]
hash_title = hashlib.md5(product_title.encode(encoding='UTF-8')).hexdigest()
if hash_title in products_set:
continue
products_set.add(hash_title)
product_info =\
{
'product_img_url':product_img_url,
'product_url':product_url,
'product_title':product_title,
'product_shop_name':product_shop_name,
'product_price':product_price
}
products_list.append(product_info)
i = 300
for j in range(1,11):
time.sleep(2)
js="var q=document.documentElement.scrollTop="+str(i)
browser.execute_script(js)
i = i+800
red_soup = BeautifulSoup(browser.page_source, 'lxml')
list_items = red_soup.find_all(class_='list_item')
for i in range(0,len(list_items)):
product_img_url = list_items[i](class_='li_img_wap')[0].img['src']
product_title = list_items[i](class_='li_img_wap')[0].img['alt']
product_shop_name = list_items[i](class_='lii_sold')[0].span.text
product_sold_info = list_items[i](class_='lii_sold')[0].text
product_url = 'https:' +list_items[i]['href']
a = list_items[i](class_='lii_price')[0].text
b = a[:len(a)-6]
product_price = b[4:len(b)]
hash_title = hashlib.md5(product_title.encode(encoding='UTF-8')).hexdigest()
if hash_title in products_set:
continue
products_set.add(hash_title)
product_info =\
{
'product_img_url':product_img_url,
'product_url':product_url,
'product_title':product_title,
'product_shop_name':product_shop_name,
'product_price':product_price
}
products_list.append(product_info)
print(len(products_list))</code></pre>
<pre><code>https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=1888
3112
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=1680
3148
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=1299
3184
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=1199
3196
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=798
3196
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=798
3196
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=798
3196
https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=798
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-132-3927d1de9935> in <module>()
39 i = 300
40 for j in range(1,11):
---> 41 time.sleep(2)
42 js="var q=document.documentElement.scrollTop="+str(i)
43 browser.execute_script(js)
KeyboardInterrupt: </code></pre>
<pre><code class="language-python">m</code></pre>
<pre><code>29</code></pre>
<pre><code class="language-python">url_i = 'https://list.tmall.com/search_product.htm?q=gucci+%C5%AE%B0%FC&type=p&spm=875.7931836/B.a2227oh.d100&xl=Gucci_1&from=mallfp..pc_1_suggest&sort=pd&cat=54294290&end_price='
# Open the listing at url_i with the price cap appended.
url_o = '39999'
url = url_i + url_o
browser.get(url)

# Scroll down eleven times, 2 s apart: scrollTop goes 300, 1100, ... in steps
# of 800 (presumably so the page loads more entries before it is parsed).
for offset in range(300, 300 + 11 * 800, 800):
    time.sleep(2)
    js = "var q=document.documentElement.scrollTop=" + str(offset)
    browser.execute_script(js)

# Parse the scrolled page once and append every entry to products_list.
# No de-duplication happens in this cell.
red_soup = BeautifulSoup(browser.page_source, 'lxml')
list_items = red_soup.find_all(class_='list_item')
for entry in list_items:
    thumbnail = entry.find_all(class_='li_img_wap')[0].img
    product_img_url = thumbnail['src']
    product_title = thumbnail['alt']
    sold_node = entry.find_all(class_='lii_sold')[0]
    product_shop_name = sold_node.span.text
    product_sold_info = sold_node.text
    product_url = 'https:' + entry['href']
    price_text = entry.find_all(class_='lii_price')[0].text
    # Keep the price text minus its first 4 and last 6 characters.
    product_price = price_text[4:-6]
    products_list.append({
        'product_img_url': product_img_url,
        'product_url': product_url,
        'product_title': product_title,
        'product_shop_name': product_shop_name,
        'product_price': product_price,
    })
print(len(products_list))</code></pre>
<pre><code>989</code></pre>
<pre><code class="language-python">import pickle</code></pre>
<pre><code class="language-python">pickle.dump(products_list, open('all_products_list.txt', 'wb'))</code></pre>
<pre><code class="language-python">products_list[200]</code></pre>
<pre><code>{'product_img_url': '//img.alicdn.com/bao/uploaded/i2/2529360066/O1CN011CMGbxl2nGiNKKf_!!2529360066.jpg_125x125Q50s50.jpg_.webp',
'product_price': '16990',
'product_shop_name': '欧瑟旗舰店',
'product_title': 'GUCCI/古奇古驰 Marmont系列2018新款女士单肩链条包 443496',
'product_url': 'https://detail.tmall.com/item.htm?id=580435674127&skuId=4033355650932&pic=//img.alicdn.com/bao/uploaded/i2/2529360066/O1CN011CMGbxl2nGiNKKf_!!2529360066.jpg_125x125Q50s50.jpg_.webp&itemTitle=GUCCI/%E5%8F%A4%E5%A5%87%E5%8F%A4%E9%A9%B0%20Marmont%E7%B3%BB%E5%88%972018%E6%96%B0%E6%AC%BE%E5%A5%B3%E5%A3%AB%E5%8D%95%E8%82%A9%E9%93%BE%E6%9D%A1%E5%8C%85%20443496&price=16990.00&from=h5'}</code></pre>