12、天猫电脑版,不好用
<pre><code class="language-python">from selenium import webdriver
browser = webdriver.Chrome()</code></pre>
<pre><code class="language-python">url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.4f87c24ed9bpjY&s=720&q=gucci&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'</code></pre>
<pre><code class="language-python">browser.get(url)</code></pre>
<pre><code class="language-python">page_source1 = browser.page_source</code></pre>
<pre><code class="language-python">page_source2 = browser.page_source</code></pre>
<pre><code class="language-python">page_source3 = browser.page_source</code></pre>
<pre><code class="language-python">page_source4 = browser.page_source</code></pre>
<pre><code class="language-python">page_source5 = browser.page_source</code></pre>
<pre><code class="language-python">page_source6 = browser.page_source</code></pre>
<pre><code class="language-python">page_source7 = browser.page_source</code></pre>
<pre><code class="language-python">page_source8 = browser.page_source</code></pre>
<pre><code class="language-python">page_source9 = browser.page_source</code></pre>
<pre><code class="language-python">page_source10 = browser.page_source</code></pre>
<pre><code class="language-python">page_source11 = browser.page_source</code></pre>
<pre><code class="language-python">page_source12 = browser.page_source</code></pre>
<pre><code class="language-python">import pickle</code></pre>
<pre><code class="language-python"># Collect the captured pages into a single list for later processing.
# Only page_source1 .. page_source10 are included here; page_source11 and
# page_source12 are not part of this list.
page_source = [page_source1,page_source2, page_source3, page_source4, page_source5, page_source6, page_source7, page_source8,page_source9,page_source10]</code></pre>
<pre><code class="language-python">len(page_source)</code></pre>
<pre><code>10</code></pre>
<pre><code class="language-python">for i in range(0, 8):
pickle.dump(page_source[i], open('./page_source/page_source' + str(i+1) + '.txt', 'wb'))</code></pre>
<pre><code class="language-python">page_source9 = browser.page_source</code></pre>
<pre><code class="language-python">red_page_source = pickle.load(open('./page_source/page_source1.txt'', 'rb'))</code></pre>
<pre><code> File "<ipython-input-24-e06e32d2ed76>", line 1
red_page_source = pickle.load(open('./page_source/page_source1.txt'', 'rb'))
^
SyntaxError: EOL while scanning string literal</code></pre>
<pre><code class="language-python">from bs4 import BeautifulSoup</code></pre>
<pre><code class="language-python">red_soup = BeautifulSoup(page_source1, 'lxml')</code></pre>
<pre><code class="language-python">product = red_soup.find_all(class_='product')</code></pre>
<pre><code class="language-python">len(product)</code></pre>
<pre><code>60</code></pre>
<pre><code class="language-python">products_list = []
for i in range(0, 10):
red_soup = BeautifulSoup(page_source[i], 'lxml')
products = red_soup.find_all(class_='product')
# product_titles = red_soup.find_all(class_='productTitle')
# product_shop_names = red_soup.find_all(class_='productShop-name')
# product_imgs = red_soup.find_all(class_='productImg-wrap')
# productStatus = red_soup.find_all(class_='productStatus')
for j in range(0, len(product_titles)):
try:
product_img_url = 'https:' + products[j](class_='productImg')[0].img['data-ks-lazyload']
except Exception as e:
product_img_url = ''
try:
product_img_url_2 = 'https:' + products[j](class_='proThumb-img ')[0].img['src']
except Exception as e:
product_img_url_2 = ''
product_url = 'https:' + products[j](class_='productImg')[0]['href']
product_shop_name = products[j](class_='productShop-name')[0].text
product_price = products[j](class_='productPrice')[0].em['title']
product_title = products[j](class_='productTitle')[0].a['title']
product_statue = products[20](class_='productStatus')[0].span.em.text
product_statue_2 = products[20](class_='productStatus')[0].text
# product_img_url = 'https:' + product_imgs[j].img['src']
# product_url = 'https:' + product_imgs[j].a['href']
# product_shop_name = product_shop_names[j].text
# product_price = product_prices[j].em['title']
# product_title = product_titles[j].a['title']
# product_statue = productStatus[j].span.em.text
# product_statue_2 = productStatus[j].text
product_info =\
{
'product_img_url':product_img_url,
'product_img_url_2':product_img_url_2,
'product_url':product_url,
'product_title':product_title,
'product_statue_2':product_statue_2,
'product_statue':product_statue,
'product_shop_name':product_shop_name,
'product_price':product_price
}
products_list.append(product_info)
print(i)</code></pre>
<pre><code>0
1
2
3
4
5
6
7
8
9</code></pre>
<pre><code class="language-python">pickle.dump(products_list, open('products_list.txt', 'wb'))</code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">products_list[60]</code></pre>
<pre><code>{'product_img_url': '',
'product_img_url_2': 'https://img.alicdn.com/bao/uploaded/i3/2658829235/O1CN01nOmoeP2I5gAB6niLN_!!2658829235.jpg_30x30.jpg',
'product_url': 'https://detail.tmall.com/item.htm?id=569219493340&skuId=4151277302125&areaId=320100&user_id=2658829235&cat_id=2&is_b=1&rn=eb80caf7637dd8e61e96fb65b209745d',
'product_title': 'Gucci/古驰女士包袋 帆布配皮双G印花购物袋手提单肩包169946 NB',
'product_statue_2': '\n月成交 0笔\n评价 0\n旺旺在线\n',
'product_statue': '0笔',
'product_shop_name': '\n基范尼旗舰店\n',
'product_price': '5360.00'}</code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">red_soup = BeautifulSoup(page_source[1], 'lxml')
</code></pre>
<pre><code class="language-python">product_img_url = 'https:' + products[j](class_='productImg')[0].img['data-ks-lazyload']
product_url = 'https:' + products[j](class_='productImg')[0]['href']
product_shop_name = products[j](class_='productShop-name')[0].text
product_price = products[j](class_='productPrice')[0].em['title']
product_title = products[j](class_='productTitle')[0].a['title']
product_statue = products[20](class_='productStatus')[0].span.em.text
product_statue_2 = products[20](class_='productStatus')[0].text
</code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle')
product_shop_names = red_soup.find_all(class_='productShop-name')
product_imgs = red_soup.find_all(class_='productImg-wrap')
productStatus = red_soup.find_all(class_='productStatus')</code></pre>
<pre><code class="language-python">products = red_soup.find_all(class_='product')</code></pre>
<pre><code class="language-python">products[20](class_='proThumb-img ')[0].img['src']</code></pre>
<pre><code>'//img.alicdn.com/bao/uploaded/i1/2037432060/TB2bB_hajnD8KJjSspbXXbbEXXa_!!2037432060.jpg_30x30.jpg'</code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">product_titles[20]</code></pre>
<pre><code><p class="productTitle">
<a data-p="21-11" href="//detail.tmall.com/item.htm?id=585327596695&amp;skuId=3966187536299&amp;areaId=320100&amp;user_id=1992682817&amp;cat_id=2&amp;is_b=1&amp;rn=152419b0947dd8d33d8d7729eec132aa" target="_blank" title="现货Gucci/古驰男士钱包老虎印花短款对折钱夹皮夹451268 K5Z1N">
现货<span class="H">Gucci</span>/<span class="H">古驰</span><span class="H">男</span>士钱<span class="H">包</span>老虎印花短款对折钱夹皮夹451268 K5Z1N
</a>
</p></code></pre>
<pre><code class="language-python"></code></pre>
<pre><code class="language-python">len(products_list)</code></pre>
<pre><code>600</code></pre>
<pre><code class="language-python">len(products_list)</code></pre>
<pre><code>8</code></pre>
<pre><code class="language-python">len(products_list[7])</code></pre>
<pre><code>60</code></pre>
<pre><code class="language-python"># NOTE(review): this call failed with the RecursionError recorded below; the
# traceback ends inside bs4/element.py, so products_list presumably held
# BeautifulSoup objects rather than plain strings at the time - TODO confirm.
pickle.dump(products_list, open('./page_source/page_source1_8.txt', 'wb'))</code></pre>
<pre><code>---------------------------------------------------------------------------
RecursionError Traceback (most recent call last)
<ipython-input-34-9fd4fb6d5b41> in <module>()
----> 1 pickle.dump(products_list, open('./page_source/page_source1_8.txt', 'wb'))
~/anaconda3/lib/python3.6/site-packages/bs4/element.py in __getnewargs__(self)
724
725 def __getnewargs__(self):
--> 726 return (str(self),)
727
728 def __getattr__(self, attr):
RecursionError: maximum recursion depth exceeded while getting the str of an object</code></pre>
<pre><code class="language-python">pickle.dump(products_list[0][0], open('./page_source/page_source1_1.txt', 'wb'))</code></pre>
<pre><code>---------------------------------------------------------------------------
RecursionError Traceback (most recent call last)
<ipython-input-36-fae09917fb6f> in <module>()
----> 1 pickle.dump(products_list[0][0], open('./page_source/page_source1_1.txt', 'wb'))
RecursionError: maximum recursion depth exceeded while pickling an object</code></pre>
<pre><code class="language-python">product_imgs = red_soup.find_all(class_='productImg-wrap')</code></pre>
<pre><code class="language-python">len(product_imgs)</code></pre>
<pre><code>60</code></pre>
<pre><code class="language-python">product_imgs[0]</code></pre>
<pre><code><div class="productImg-wrap">
<a class="productImg" data-p="1-10" href="//detail.tmall.com/item.htm?id=576930907699&amp;skuId=3967125822152&amp;areaId=320100&amp;user_id=1879766137&amp;cat_id=2&amp;is_b=1&amp;rn=9a0913e9570137e83dc5fc989a5aef19" target="_blank">
<img src="//img.alicdn.com/bao/uploaded/i4/1879766137/TB2FnYlvyMnBKNjSZFoXXbOSFXa_!!1879766137.jpg"/>
</a>
</div></code></pre>
<pre><code class="language-python">product_img_url = 'https:' + product_imgs[0].img['src']
product_img_url</code></pre>
<pre><code>'https://img.alicdn.com/bao/uploaded/i4/1879766137/TB2FnYlvyMnBKNjSZFoXXbOSFXa_!!1879766137.jpg'</code></pre>
<pre><code class="language-python">product_url = 'https:' + product_imgs[0].a['href']
product_url</code></pre>
<pre><code>'https://detail.tmall.com/item.htm?id=576930907699&skuId=3967125822152&areaId=320100&user_id=1879766137&cat_id=2&is_b=1&rn=9a0913e9570137e83dc5fc989a5aef19'</code></pre>
<pre><code class="language-python">product_prices = red_soup.find_all(class_='productPrice')</code></pre>
<pre><code class="language-python">product_prices[0]</code></pre>
<pre><code><p class="productPrice">
<em title="11735.00"><b>¥</b>11735.00</em>
</p></code></pre>
<pre><code class="language-python">product_prices[0].em['title']</code></pre>
<pre><code>'11735.00'</code></pre>
<pre><code class="language-python">product_price = product_prices[0].em['title']</code></pre>
<pre><code class="language-python">product_shop_names = red_soup.find_all(class_='productShop-name')</code></pre>
<pre><code class="language-python">product_shop_name = product_shop_names[0].text</code></pre>
<pre><code class="language-python">product_shop_name</code></pre>
<pre><code>'\n集美海外专营店\n'</code></pre>
<pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle')</code></pre>
<pre><code class="language-python">product_titles[0]</code></pre>
<pre><code><p class="productTitle">
<a data-p="1-11" href="//detail.tmall.com/item.htm?id=576930907699&amp;skuId=3967125822152&amp;areaId=320100&amp;user_id=1879766137&amp;cat_id=2&amp;is_b=1&amp;rn=9a0913e9570137e83dc5fc989a5aef19" target="_blank" title="Gucci 古奇 新款黑色牛皮双G压纹拉链磁扣胸包腰包男495450">
<span class="H">Gucci</span> 古奇 新款黑色牛皮双G压纹拉链磁扣胸<span class="H">包</span>腰<span class="H">包</span><span class="H">男</span>495450
</a>
</p></code></pre>
<pre><code class="language-python">product_title = product_titles[0].a['title']</code></pre>
<pre><code class="language-python">productStatus = red_soup.find_all(class_='productStatus')</code></pre>
<pre><code class="language-python">productStatus[0]</code></pre>
<pre><code><p class="productStatus">
<span>月成交 <em>0笔</em></span>
<span>评价 <a data-p="1-1" href="//detail.tmall.com/item.htm?id=576930907699&amp;skuId=3967125822152&amp;areaId=320100&amp;user_id=1879766137&amp;cat_id=2&amp;is_b=1&amp;rn=9a0913e9570137e83dc5fc989a5aef19&amp;on_comment=1#J_TabBar" target="_blank">0</a></span>
<span class="ww-light ww-small" data-atp="a!1-2,,,,,,,1879766137" data-display="inline" data-icon="small" data-item="576930907699" data-nick="集美海外专营店" data-tnick="集美海外专营店"><a class="ww-inline ww-online" href="https://amos.alicdn.com/getcid.aw?v=3&amp;groupid=0&amp;s=1&amp;charset=utf-8&amp;uid=%E9%9B%86%E7%BE%8E%E6%B5%B7%E5%A4%96%E4%B8%93%E8%90%A5%E5%BA%97&amp;site=cntaobao&amp;fromid=cntaobao张超然然然" target="_blank" title="点此可以直接和卖家交流选好的宝贝,或相互交流网购体验,还支持语音视频噢。"><span>旺旺在线</span></a></span>
</p></code></pre>
<pre><code class="language-python">productStatus[0].text</code></pre>
<pre><code>'\n月成交 0笔\n评价 0\n旺旺在线\n'</code></pre>
<pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle')
product_shop_names = red_soup.find_all(class_='productShop-name')
product_imgs = red_soup.find_all(class_='productImg-wrap')
productStatus = red_soup.find_all(class_='productStatus')
product_img_url = 'https:' + product_imgs[0].img['src']
product_url = 'https:' + product_imgs[0].a['href']
product_shop_name = product_shop_names[0].text
product_price = product_prices[0].em['title']
product_title = product_titles[0].a['title']
product_statue = productStatus[0].span.em.text
product_statue_2 = productStatus[0].text</code></pre>