python


4、jd_with_image

<pre><code> from bs4 import BeautifulSoup from selenium import webdriver import time import hashlib import os import pickle import random import xlsxwriter import urllib.request from multiprocessing import Pool import socket socket.setdefaulttimeout(5) def mkdir(path): path = path.strip() path = path.rstrip("\\") isExists = os.path.exists(path) # 判断结果 if not isExists: os.makedirs(path) print(path + ' 创建成功') return True else: # 如果目录存在则不创建,并提示目录已存在 print(path + ' 目录已存在') return False def function(date): return date['product_price'] def openUrl(url): # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} it_header = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'} req = urllib.request.Request(url, headers=it_header) response = urllib.request.urlopen(req) # 请求 html = response.read() print(html) Soup = BeautifulSoup(html, 'lxml') return Soup style_list = [ '男钱包', '女钱包', '男包','女包', '男鞋', '女鞋','男装', '女装', '男腰带','女腰带', '男围巾', '女围巾' ] it_goods_i_id = 0 work_path = './jingdong/' def makdir_img_path(): for download_type in style_list: it_goods_i_id = 0 product_info_list = [] img_save_path = './jingdong/'+ download_type + '/img/' mkdir(img_save_path) jd_list = pickle.load(open('./jingdong/'+ download_type + '.txt', 'rb')) for product in jd_list: product_title = product['product_title'] product_img_url = product['product_img_url'] product_url = product['product_url'] product_plust_price = product['product_plust_price'] product_activity_info = product['product_activity_info'] product_commit = product['product_commit'] product_shop_names = product['product_shop_names'] product_price = product['product_price'] product_img_url_path = img_save_path + str(it_goods_i_id) + '.jpg' it_goods_i_id = it_goods_i_id + 1 product_info = \ { 'product_img_url': product_img_url, 'product_url': product_url, 'product_title': product_title, 'product_plust_price': product_plust_price, 'product_activity_info': product_activity_info, 'product_commit': product_commit, 'product_shop_names': product_shop_names, 'product_img_url_path': product_img_url_path, 'product_price': product_price } print(product_img_url_path) print('******************') product_info_list.append(product_info) # 以价格排序 product_info_list.sort(key=function) pickle.dump(product_info_list, open(work_path + download_type + '/product_info_list.txt', 'wb')) def get_img_path(): for download_type in style_list: it_goods_i_id = 0 product_info_list = [] img_save_path = './jingdong/'+ download_type + '/img/' mkdir(img_save_path) jd_list = pickle.load(open('./jingdong/'+ download_type + '.txt', 'rb')) for product in jd_list: print('start') product_url = product['product_url'] product_title = product['product_title'] product_img_url = product['product_img_url'] if product_img_url =='': try: Soup = openUrl(product_url) except Exception as e: print(e) continue # print(Soup) try: product_imgs = Soup.find_all(class_="jqzoom main-img") # print(product_imgs) img_url_1 = product_imgs[0].img['data-origin'] # img_url_2 = product_imgs[0].img['data-url'] except Exception as e: print(e) continue print('************************************') print(img_url_1) # print(img_url_2) print('************************************') product_img_url = 'https:' + img_url_1 product_plust_price = product['product_plust_price'] product_activity_info = product['product_activity_info'] product_commit = product['product_commit'] product_shop_names = product['product_shop_names'] product_price = product['product_price'] product_img_url_path = img_save_path + str(it_goods_i_id) + '.jpg' it_goods_i_id = it_goods_i_id + 1 product_info = \ { 'product_img_url': product_img_url, 'product_url': product_url, 'product_title': product_title, 'product_plust_price': product_plust_price, 'product_activity_info': product_activity_info, 'product_commit': product_commit, 'product_shop_names': product_shop_names, 'product_img_url_path': product_img_url_path, 'product_price': product_price } print(product_img_url_path) print('******************') product_info_list.append(product_info) # 以价格排序 product_info_list.sort(key=function) pickle.dump(product_info_list, open(work_path + download_type + '/product_info_list.txt', 'wb')) class DownImgClass: """一个简单的类实例""" def __init__(self): self.imgurl_all_list = [] def downUrl(self, url, path_name): urllib.request.urlretrieve(url, path_name) print(path_name + ": success") return None def start_download(self): for i in range(len(style_list)): self.multi_download_img(style_list[i]) def multi_download_img(self, download_type): # ''' self.imgurl_all_list = [] # 读取变量 product_info_list = pickle.load(open(work_path + download_type + '/product_info_list.txt', 'rb')) print('read') for product in product_info_list: img_url = product['product_img_url'] if img_url == '': continue img_path = product['product_img_url_path'] img_url_dict = {'img_url': img_url, 'img_path': img_path} self.imgurl_all_list.append(img_url_dict) print(len(self.imgurl_all_list)) print('down2') pool = Pool() pool.map(self.second_multi_download_img, [i for i in range(len(self.imgurl_all_list))]) pool.close() pool.join() print('down1') def second_multi_download_img(self, number): print(self.imgurl_all_list[number]) img_url = self.imgurl_all_list[number]['img_url'] url_path = self.imgurl_all_list[number]['img_path'] print('down') try: self.downUrl(img_url, url_path) except Exception as e: print(e) if __name__ == "__main__": # get_img_path() # makdir_img_path() a = time.time() # 获取信息 c = time.time() d = a - c # 下载图片 down_img = DownImgClass() down_img.start_download() b = time.time() # 输出总耗时 print((b - a) / 60) print(d / 60) </code></pre>

页面列表

ITEM_HTML