python


12、天猫电脑版，不好用

<pre><code class="language-python">from selenium import webdriver browser = webdriver.Chrome()</code></pre> <pre><code class="language-python">url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.4f87c24ed9bpjY&amp;s=720&amp;q=gucci&amp;sort=s&amp;style=g&amp;from=mallfp..pc_1_searchbutton&amp;type=pc#J_Filter'</code></pre> <pre><code class="language-python">browser.get(url)</code></pre> <pre><code class="language-python">page_source1 = browser.page_source</code></pre> <pre><code class="language-python">page_source2 = browser.page_source</code></pre> <pre><code class="language-python">page_source3 = browser.page_source</code></pre> <pre><code class="language-python">page_source4 = browser.page_source</code></pre> <pre><code class="language-python">page_source5 = browser.page_source</code></pre> <pre><code class="language-python">page_source6 = browser.page_source</code></pre> <pre><code class="language-python">page_source7 = browser.page_source</code></pre> <pre><code class="language-python">page_source8 = browser.page_source</code></pre> <pre><code class="language-python">page_source9 = browser.page_source</code></pre> <pre><code class="language-python">page_source10 = browser.page_source</code></pre> <pre><code class="language-python">page_source11 = browser.page_source</code></pre> <pre><code class="language-python">page_source12 = browser.page_source</code></pre> <pre><code class="language-python">import pickle</code></pre> <pre><code class="language-python">page_source = [page_source1,page_source2, page_source3, page_source4, page_source5, page_source6, page_source7, page_source8,page_source9,page_source10]</code></pre> <pre><code class="language-python">len(page_source)</code></pre> <pre><code>10</code></pre> <pre><code class="language-python">for i in range(0, 8): pickle.dump(page_source[i], open('./page_source/page_source' + str(i+1) + '.txt', 'wb'))</code></pre> <pre><code class="language-python">page_source9 = 
browser.page_source</code></pre> <pre><code class="language-python">red_page_source = pickle.load(open('./page_source/page_source1.txt'', 'rb'))</code></pre> <pre><code> File "&lt;ipython-input-24-e06e32d2ed76&gt;", line 1 red_page_source = pickle.load(open('./page_source/page_source1.txt'', 'rb')) ^ SyntaxError: EOL while scanning string literal</code></pre> <pre><code class="language-python">from bs4 import BeautifulSoup</code></pre> <pre><code class="language-python">red_soup = BeautifulSoup(page_source1, 'lxml')</code></pre> <pre><code class="language-python">product = red_soup.find_all(class_='product')</code></pre> <pre><code class="language-python">len(product)</code></pre> <pre><code>60</code></pre> <pre><code class="language-python">products_list = [] for i in range(0, 10): red_soup = BeautifulSoup(page_source[i], 'lxml') products = red_soup.find_all(class_='product') # product_titles = red_soup.find_all(class_='productTitle') # product_shop_names = red_soup.find_all(class_='productShop-name') # product_imgs = red_soup.find_all(class_='productImg-wrap') # productStatus = red_soup.find_all(class_='productStatus') for j in range(0, len(product_titles)): try: product_img_url = 'https:' + products[j](class_='productImg')[0].img['data-ks-lazyload'] except Exception as e: product_img_url = '' try: product_img_url_2 = 'https:' + products[j](class_='proThumb-img ')[0].img['src'] except Exception as e: product_img_url_2 = '' product_url = 'https:' + products[j](class_='productImg')[0]['href'] product_shop_name = products[j](class_='productShop-name')[0].text product_price = products[j](class_='productPrice')[0].em['title'] product_title = products[j](class_='productTitle')[0].a['title'] product_statue = products[20](class_='productStatus')[0].span.em.text product_statue_2 = products[20](class_='productStatus')[0].text # product_img_url = 'https:' + product_imgs[j].img['src'] # product_url = 'https:' + product_imgs[j].a['href'] # product_shop_name = 
product_shop_names[j].text # product_price = product_prices[j].em['title'] # product_title = product_titles[j].a['title'] # product_statue = productStatus[j].span.em.text # product_statue_2 = productStatus[j].text product_info =\ { 'product_img_url':product_img_url, 'product_img_url_2':product_img_url_2, 'product_url':product_url, 'product_title':product_title, 'product_statue_2':product_statue_2, 'product_statue':product_statue, 'product_shop_name':product_shop_name, 'product_price':product_price } products_list.append(product_info) print(i)</code></pre> <pre><code>0 1 2 3 4 5 6 7 8 9</code></pre> <pre><code class="language-python">pickle.dump(products_list, open('products_list.txt', 'wb'))</code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python">products_list[60]</code></pre> <pre><code>{'product_img_url': '', 'product_img_url_2': 'https://img.alicdn.com/bao/uploaded/i3/2658829235/O1CN01nOmoeP2I5gAB6niLN_!!2658829235.jpg_30x30.jpg', 'product_url': 'https://detail.tmall.com/item.htm?id=569219493340&amp;skuId=4151277302125&amp;areaId=320100&amp;user_id=2658829235&amp;cat_id=2&amp;is_b=1&amp;rn=eb80caf7637dd8e61e96fb65b209745d', 'product_title': 'Gucci/古驰女士包袋 帆布配皮双G印花购物袋手提单肩包169946 NB', 'product_statue_2': '\n月成交 0笔\n评价 0\n旺旺在线\n', 'product_statue': '0笔', 'product_shop_name': '\n基范尼旗舰店\n', 'product_price': '5360.00'}</code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python">red_soup = BeautifulSoup(page_source[1], 'lxml') </code></pre> <pre><code class="language-python">product_img_url = 'https:' + products[j](class_='productImg')[0].img['data-ks-lazyload'] product_url = 'https:' + products[j](class_='productImg')[0]['href'] product_shop_name = products[j](class_='productShop-name')[0].text product_price = products[j](class_='productPrice')[0].em['title'] product_title = products[j](class_='productTitle')[0].a['title'] product_statue = products[20](class_='productStatus')[0].span.em.text 
product_statue_2 = products[20](class_='productStatus')[0].text </code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle') product_shop_names = red_soup.find_all(class_='productShop-name') product_imgs = red_soup.find_all(class_='productImg-wrap') productStatus = red_soup.find_all(class_='productStatus')</code></pre> <pre><code class="language-python">products = red_soup.find_all(class_='product')</code></pre> <pre><code class="language-python">products[20](class_='proThumb-img ')[0].img['src']</code></pre> <pre><code>'//img.alicdn.com/bao/uploaded/i1/2037432060/TB2bB_hajnD8KJjSspbXXbbEXXa_!!2037432060.jpg_30x30.jpg'</code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python">product_titles[20]</code></pre> <pre><code>&lt;p class="productTitle"&gt; &lt;a data-p="21-11" href="//detail.tmall.com/item.htm?id=585327596695&amp;amp;skuId=3966187536299&amp;amp;areaId=320100&amp;amp;user_id=1992682817&amp;amp;cat_id=2&amp;amp;is_b=1&amp;amp;rn=152419b0947dd8d33d8d7729eec132aa" target="_blank" title="现货Gucci/古驰男士钱包老虎印花短款对折钱夹皮夹451268 K5Z1N"&gt; 现货&lt;span class="H"&gt;Gucci&lt;/span&gt;/&lt;span class="H"&gt;古驰&lt;/span&gt;&lt;span class="H"&gt;男&lt;/span&gt;士钱&lt;span class="H"&gt;包&lt;/span&gt;老虎印花短款对折钱夹皮夹451268 K5Z1N &lt;/a&gt; &lt;/p&gt;</code></pre> <pre><code class="language-python"></code></pre> <pre><code class="language-python">len(products_list)</code></pre> <pre><code>600</code></pre> <pre><code class="language-python">len(products_list)</code></pre> <pre><code>8</code></pre> <pre><code class="language-python">len(products_list[7])</code></pre> <pre><code>60</code></pre> <pre><code class="language-python">pickle.dump(products_list, open('./page_source/page_source1_8.txt', 'wb'))</code></pre> <pre><code>--------------------------------------------------------------------------- 
RecursionError Traceback (most recent call last) &lt;ipython-input-34-9fd4fb6d5b41&gt; in &lt;module&gt;() ----&gt; 1 pickle.dump(products_list, open('./page_source/page_source1_8.txt', 'wb')) ~/anaconda3/lib/python3.6/site-packages/bs4/element.py in __getnewargs__(self) 724 725 def __getnewargs__(self): --&gt; 726 return (str(self),) 727 728 def __getattr__(self, attr): RecursionError: maximum recursion depth exceeded while getting the str of an object</code></pre> <pre><code class="language-python">pickle.dump(products_list[0][0], open('./page_source/page_source1_1.txt', 'wb'))</code></pre> <pre><code>--------------------------------------------------------------------------- RecursionError Traceback (most recent call last) &lt;ipython-input-36-fae09917fb6f&gt; in &lt;module&gt;() ----&gt; 1 pickle.dump(products_list[0][0], open('./page_source/page_source1_1.txt', 'wb')) RecursionError: maximum recursion depth exceeded while pickling an object</code></pre> <pre><code class="language-python">product_imgs = red_soup.find_all(class_='productImg-wrap')</code></pre> <pre><code class="language-python">len(product_imgs)</code></pre> <pre><code>60</code></pre> <pre><code class="language-python">product_imgs[0]</code></pre> <pre><code>&lt;div class="productImg-wrap"&gt; &lt;a class="productImg" data-p="1-10" href="//detail.tmall.com/item.htm?id=576930907699&amp;amp;skuId=3967125822152&amp;amp;areaId=320100&amp;amp;user_id=1879766137&amp;amp;cat_id=2&amp;amp;is_b=1&amp;amp;rn=9a0913e9570137e83dc5fc989a5aef19" target="_blank"&gt; &lt;img src="//img.alicdn.com/bao/uploaded/i4/1879766137/TB2FnYlvyMnBKNjSZFoXXbOSFXa_!!1879766137.jpg"/&gt; &lt;/a&gt; &lt;/div&gt;</code></pre> <pre><code class="language-python">product_img_url = 'https:' + product_imgs[0].img['src'] product_img_url</code></pre> <pre><code>'https://img.alicdn.com/bao/uploaded/i4/1879766137/TB2FnYlvyMnBKNjSZFoXXbOSFXa_!!1879766137.jpg'</code></pre> <pre><code class="language-python">product_url = 'https:' + 
product_imgs[0].a['href'] product_url</code></pre> <pre><code>'https://detail.tmall.com/item.htm?id=576930907699&amp;skuId=3967125822152&amp;areaId=320100&amp;user_id=1879766137&amp;cat_id=2&amp;is_b=1&amp;rn=9a0913e9570137e83dc5fc989a5aef19'</code></pre> <pre><code class="language-python">product_prices = red_soup.find_all(class_='productPrice')</code></pre> <pre><code class="language-python">product_prices[0]</code></pre> <pre><code>&lt;p class="productPrice"&gt; &lt;em title="11735.00"&gt;&lt;b&gt;¥&lt;/b&gt;11735.00&lt;/em&gt; &lt;/p&gt;</code></pre> <pre><code class="language-python">product_prices[0].em['title']</code></pre> <pre><code>'11735.00'</code></pre> <pre><code class="language-python">product_price = product_prices[0].em['title']</code></pre> <pre><code class="language-python">product_shop_names = red_soup.find_all(class_='productShop-name')</code></pre> <pre><code class="language-python">product_shop_name = product_shop_names[0].text</code></pre> <pre><code class="language-python">product_shop_name</code></pre> <pre><code>'\n集美海外专营店\n'</code></pre> <pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle')</code></pre> <pre><code class="language-python">product_titles[0]</code></pre> <pre><code>&lt;p class="productTitle"&gt; &lt;a data-p="1-11" href="//detail.tmall.com/item.htm?id=576930907699&amp;amp;skuId=3967125822152&amp;amp;areaId=320100&amp;amp;user_id=1879766137&amp;amp;cat_id=2&amp;amp;is_b=1&amp;amp;rn=9a0913e9570137e83dc5fc989a5aef19" target="_blank" title="Gucci 古奇 新款黑色牛皮双G压纹拉链磁扣胸包腰包男495450"&gt; &lt;span class="H"&gt;Gucci&lt;/span&gt; 古奇 新款黑色牛皮双G压纹拉链磁扣胸&lt;span class="H"&gt;包&lt;/span&gt;腰&lt;span class="H"&gt;包&lt;/span&gt;&lt;span class="H"&gt;男&lt;/span&gt;495450 &lt;/a&gt; &lt;/p&gt;</code></pre> <pre><code class="language-python">product_title = product_titles[0].a['title']</code></pre> <pre><code class="language-python">productStatus = red_soup.find_all(class_='productStatus')</code></pre> 
<pre><code class="language-python">productStatus[0]</code></pre> <pre><code>&lt;p class="productStatus"&gt; &lt;span&gt;月成交 &lt;em&gt;0笔&lt;/em&gt;&lt;/span&gt; &lt;span&gt;评价 &lt;a data-p="1-1" href="//detail.tmall.com/item.htm?id=576930907699&amp;amp;skuId=3967125822152&amp;amp;areaId=320100&amp;amp;user_id=1879766137&amp;amp;cat_id=2&amp;amp;is_b=1&amp;amp;rn=9a0913e9570137e83dc5fc989a5aef19&amp;amp;on_comment=1#J_TabBar" target="_blank"&gt;0&lt;/a&gt;&lt;/span&gt; &lt;span class="ww-light ww-small" data-atp="a!1-2,,,,,,,1879766137" data-display="inline" data-icon="small" data-item="576930907699" data-nick="集美海外专营店" data-tnick="集美海外专营店"&gt;&lt;a class="ww-inline ww-online" href="https://amos.alicdn.com/getcid.aw?v=3&amp;amp;groupid=0&amp;amp;s=1&amp;amp;charset=utf-8&amp;amp;uid=%E9%9B%86%E7%BE%8E%E6%B5%B7%E5%A4%96%E4%B8%93%E8%90%A5%E5%BA%97&amp;amp;site=cntaobao&amp;amp;fromid=cntaobao张超然然然" target="_blank" title="点此可以直接和卖家交流选好的宝贝,或相互交流网购体验,还支持语音视频噢。"&gt;&lt;span&gt;旺旺在线&lt;/span&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</code></pre> <pre><code class="language-python">productStatus[0].text</code></pre> <pre><code>'\n月成交 0笔\n评价 0\n旺旺在线\n'</code></pre> <pre><code class="language-python">product_titles = red_soup.find_all(class_='productTitle') product_shop_names = red_soup.find_all(class_='productShop-name') product_imgs = red_soup.find_all(class_='productImg-wrap') productStatus = red_soup.find_all(class_='productStatus') product_img_url = 'https:' + product_imgs[0].img['src'] product_url = 'https:' + product_imgs[0].a['href'] product_shop_name = product_shop_names[0].text product_price = product_prices[0].em['title'] product_title = product_titles[0].a['title'] product_statue = productStatus[0].span.em.text product_statue_2 = productStatus[0].text</code></pre>

页面列表

ITEM_HTML