python


11、爬天猫,使用手机版页面

"""Scrape Gucci product listings from Tmall's mobile search pages.

The browser is started with an ``iphone`` user agent so that Tmall serves the
mobile list layout.  Each result card (class ``list_item``) is turned into a
dict with the keys ``product_img_url``, ``product_url``, ``product_title``,
``product_shop_name`` and ``product_price``; the collected list is finally
pickled to ``all_products_list.txt``.

A record captured during the original notebook session looked like this
(long URLs shortened here):

    {'product_img_url': '//img.alicdn.com/bao/uploaded/i3/2010197355/...webp',
     'product_price': '13590',
     'product_shop_name': '孔雀翎海外专营店',
     'product_title': 'Gucci/古奇2019新款特色GG双皮革手柄387102薄款公文手提男包',
     'product_url': 'https://detail.tmall.com/item.htm?id=573909557695&...'}
"""
import hashlib
import pickle
import time

# Category search URLs gathered while exploring the site.  The Chinese query
# words are percent-encoded bytes (they match the GBK encoding of the words).
men_bag_url = 'https://list.tmall.com/search_product.htm?q=gucci%C4%D0%B0%FC&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=pd&style=tile&cat=54294290&end_price=25360'
women_bag_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C5%AE%B0%FC&type=p&spm=875.7931836/B.a2227oh.d100&xl=Gucci_1&from=mallfp..pc_1_suggest&sort=pd&cat=54294290&end_price=39999'
men_shoes_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%D0%AC&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&cat=54298748&style=list&end_price=24047'
men_scarve_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%CE%A7%BD%ED&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=pd&style=list&end_price=6830'
women_scarve_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C5%AE%CE%A7%BD%ED&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=10970'
men_belt_url = 'https://list.tmall.com/search_product.htm?q=gucci+%C4%D0%D1%FC%B4%F8&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=9620'
women_belts_url = 'https://list.tmall.com/search_product.htm?q=gucci%C5%AE%D1%FC%B4%F8&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=12563'
men_clothes_url = 'https://list.tmall.com/search_product.htm?q=gucci%C4%D0%D7%B0&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=25480'
# The original literal held the mis-decoded characters "Ůװ" here; they are
# the bytes C5 AE D7 B0, written percent-encoded like the sibling URLs.
women_clothes_url = 'https://list.tmall.com/search_product.htm?q=gucci%C5%AE%D7%B0&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price=25940'

# Generic "gucci" search; the crawl cursor is appended as the end_price value.
SEARCH_URL_PREFIX = 'https://list.tmall.com/search_product.htm?q=gucci&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_searchbutton&sort=pd&style=list&end_price='
all_url = SEARCH_URL_PREFIX + '39999'

PAGE_LOAD_WAIT = 3          # seconds to wait after browser.get()
SCROLL_WAIT = 2             # seconds to wait before each scroll step
FIRST_SCROLL_OFFSET = 300   # scrollTop used for the first scroll step
SCROLL_STEP = 800           # scrollTop increment per scroll step
MAX_PRICE_PAGES = 57        # upper bound on end_price pagination passes

# The price text has 4 leading and 6 trailing characters cut off.
# NOTE(review): presumably a currency prefix and a decimals/whitespace
# suffix -- confirm against the live markup.
PRICE_PREFIX_LEN = 4
PRICE_SUFFIX_LEN = 6


def make_mobile_browser():
    """Start Chrome with an ``iphone`` user agent and return the driver."""
    from selenium import webdriver  # local import: only needed for a live run

    option = webdriver.ChromeOptions()
    option.add_argument('--user-agent=iphone')
    # ``chrome_options=`` triggered "DeprecationWarning: use options instead
    # of chrome_options" in the recorded session.
    return webdriver.Chrome(options=option)


def extract_price(price_text):
    """Return *price_text* without its first 4 and last 6 characters.

    Text too short to contain anything between the two cuts yields ``''``.
    """
    return price_text[PRICE_PREFIX_LEN:-PRICE_SUFFIX_LEN]


def title_fingerprint(title):
    """Return the MD5 hex digest of *title*, used as the de-duplication key."""
    return hashlib.md5(title.encode(encoding='UTF-8')).hexdigest()


def parse_products(page_source):
    """Parse every ``list_item`` card in *page_source* into a product dict.

    Raises IndexError/KeyError/AttributeError/TypeError if a card lacks one
    of the expected sub-elements or attributes.
    """
    from bs4 import BeautifulSoup  # local import: only needed for a live run

    soup = BeautifulSoup(page_source, 'lxml')
    products = []
    for item in soup.find_all(class_='list_item'):
        image = item(class_='li_img_wap')[0].img
        sold = item(class_='lii_sold')[0]
        products.append({
            'product_img_url': image['src'],
            'product_url': 'https:' + item['href'],
            'product_title': image['alt'],
            'product_shop_name': sold.span.text,
            'product_price': extract_price(item(class_='lii_price')[0].text),
        })
    return products


def add_new_products(candidates, products_list, products_set):
    """Append candidates whose title has not been seen; return how many.

    *products_set* holds the title fingerprints of everything already in
    *products_list*; both containers are updated in place.
    """
    added = 0
    for product in candidates:
        key = title_fingerprint(product['product_title'])
        if key in products_set:
            continue
        products_set.add(key)
        products_list.append(product)
        added += 1
    return added


def collect_page(browser, url, products_list, products_set, scroll_steps=10):
    """Load *url*, scroll down step by step, and collect unseen products.

    The page is parsed once after loading and again after every scroll step.
    Returns the number of products added.
    """
    browser.get(url)
    time.sleep(PAGE_LOAD_WAIT)
    added = add_new_products(
        parse_products(browser.page_source), products_list, products_set)

    # The offset has its own name: the notebook reused ``i`` for both the
    # scroll offset and the item loop index.
    offset = FIRST_SCROLL_OFFSET
    for _ in range(scroll_steps):
        time.sleep(SCROLL_WAIT)
        browser.execute_script(
            "var q=document.documentElement.scrollTop=" + str(offset))
        offset += SCROLL_STEP
        added += add_new_products(
            parse_products(browser.page_source), products_list, products_set)
    return added


def crawl_by_price(browser, products_list, products_set,
                   max_pages=MAX_PRICE_PAGES, collect=collect_page):
    """Page through the search by feeding the last price back as end_price.

    Each pass requests ``SEARCH_URL_PREFIX + <price of the last collected
    product>``.  The crawl stops after *max_pages* passes, when there is no
    product to take a price from, or as soon as a pass adds nothing.  In that
    last case the cursor cannot move, and the recorded session shows the
    result:

        end_price=1888 -> 3112 products
        end_price=1680 -> 3148
        end_price=1299 -> 3184
        end_price=1199 -> 3196
        end_price=798  -> 3196   (repeated until a manual KeyboardInterrupt)

    *collect* is the page collector (same signature as ``collect_page``).
    Returns the total number of products added.
    """
    total_added = 0
    for _ in range(max_pages):
        if not products_list:
            break  # nothing to derive the next end_price from
        url = SEARCH_URL_PREFIX + products_list[-1]['product_price']
        print(url)
        added = collect(browser, url, products_list, products_set)
        total_added += added
        print(len(products_list))
        if added == 0:
            break  # same last product => same URL next time; stop here
    return total_added


def main():
    """Run the crawl and pickle the result to ``all_products_list.txt``."""
    products_set = set()
    products_list = []
    browser = make_mobile_browser()
    try:
        # First pass over the generic search (the session recorded 40
        # products after this step).
        collect_page(browser, all_url, products_list, products_set)
        crawl_by_price(browser, products_list, products_set)
        # Women's bags; the notebook used 11 scroll steps here and appended
        # without de-duplication (989 entries recorded).  De-duplication is
        # now applied for consistency with the other passes.
        collect_page(browser, women_bag_url, products_list, products_set,
                     scroll_steps=11)
        print(len(products_list))
    finally:
        browser.quit()

    with open('all_products_list.txt', 'wb') as out_file:
        pickle.dump(products_list, out_file)


if __name__ == '__main__':
    main()

页面列表

ITEM_HTML