python


5、gucci_backup_1

"""Scrape Gucci product listings from gucci.cn, gucci.com (Italy) and kaola.com.

Each ``download_*`` function collects one product category from one site and
pickles the price-sorted result under ``./gucci/<download_type>/``.
``write_excel`` lays the three pickled lists side by side in one worksheet.
"""
import json
import os
import pickle
import random
import re
import socket
import ssl
import sys
import time
import urllib.request
from functools import partial
from multiprocessing import Pool

import xlsxwriter
from bs4 import BeautifulSoup

# Every socket opened by this process times out after 30 seconds.
socket.setdefaulttimeout(30)
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is turned off process-wide.
ssl._create_default_https_context = ssl._create_unverified_context

# Header sent with every page/JSON request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}

# Characters removed by re_str().
_STRIP_TABLE = str.maketrans('', '', ' \r\n\t')

# download_type -> value of the ``ni`` query parameter on gucci.cn.
_ZH_CATEGORY_IDS = {
    'men_all_clothes': 97,
    'men_all_shoes': 17,
    'men_all_bags': 16,
    'men_all_belts': 64,
    'men_all_wallets': 63,
    'men_all_scarves': 66,
    'women_all_clothes': 90,
    'women_all_shoes': 13,
    'women_all_bags': 12,
    'women_all_belts': 41,
    'women_all_wallets': 40,
    'women_all_scarves': 42,
}

# download_type -> value of the ``categoryCode`` query parameter on gucci.com/it.
_IT_CATEGORY_CODES = {
    'men_all_bags': 'men-bags',
    'men_all_clothes': 'men-readytowear',
    'men_all_shoes': 'men-shoes',
    'men_all_belts': 'men-accessories-belts',
    'men_all_wallets': 'men-accessories-wallets',
    'men_all_scarves': 'men-accessories-scarves',
    'women_all_bags': 'women-handbags',
    'women_all_clothes': 'women-readytowear',
    'women_all_shoes': 'women-shoes',
    'women_all_belts': 'women-accessories-belts',
    'women_all_wallets': 'women-accessories-wallets',
    'women_all_scarves': 'women-accessories-silks-and-scarves',
}

# download_type -> (host, category id, isStock, proIds, headCategoryId,
# changeContent) used to assemble the kaola.com brand-page URL.
_KAOLA_QUERIES = {
    'men_all_shoes': ('search.kaola.com', '1078', 'true', '', '-1', 'c'),
    'men_all_bags': ('search.kaola.com', '1027', 'true', '', '-1', 'c'),
    'men_all_clothes': ('search.kaola.com', '1047', 'true', '', '-1', 'c'),
    'men_all_belts': ('search.kaola.com', '1073', 'true', '100224_4111380', '1058', '0'),
    'men_all_wallets': ('www.kaola.com', '6259', 'true', '100224_4111380', '-1', 'isStock'),
    'men_all_scarves': ('search.kaola.com', '1072', 'false', '100224_4111380', '1058', '0'),
    'women_all_shoes': ('search.kaola.com', '1077', 'true', '100224_4111421', '-1', 'c'),
    'women_all_bags': ('search.kaola.com', '1028', 'true', '100224_4111421', '-1', 'c'),
    'women_all_clothes': ('search.kaola.com', '1048', 'true', '', '-1', 'crumbs_0'),
    'women_all_belts': ('search.kaola.com', '1073', 'true', '100224_4111421', '1058', 'c'),
    'women_all_wallets': ('search.kaola.com', '6259', 'true', '100224_4111421', '-1', '0'),
    'women_all_scarves': ('search.kaola.com', '1072', 'false', '100224_4111421', '1058', '0'),
}

# Column widths for worksheet columns A..T, in order.
_COLUMN_WIDTHS = [5, 80.88, 10.5, 19.38, 12, 82,
                  5, 55, 10.5, 19.38, 8.25, 106,
                  5, 89, 11.5, 50.38, 14.25, 25.75, 25.75, 28.75]

# Header row for worksheet columns A..T, in order.
_COLUMN_TITLES = ['欧洲', '商品名称', '正面图', '货号', '欧洲零售价', '商品链接',
                  '中国', '商品名称', '正面图', '货号', '中国零售价', '商品链接',
                  '考拉', '商品名称', '正面图', '商品链接', '到手价', '所有售价',
                  '备选颜色', '销售商家']


def mkdir(path):
    """Create directory ``path`` (with parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are stripped first.

    Returns:
        True if the directory was created, False if it was already there.
    """
    path = path.strip()
    path = path.rstrip("\\")
    if os.path.exists(path):
        print(path + ' 目录已存在')
        return False
    # exist_ok tolerates another worker process creating it in between.
    os.makedirs(path, exist_ok=True)
    print(path + ' 创建成功')
    return True


def return_json(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text."""
    req = urllib.request.Request(url, headers=_REQUEST_HEADERS)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")


def openUrl(url):
    """Fetch ``url`` and return its HTML parsed by BeautifulSoup (lxml)."""
    return BeautifulSoup(return_json(url), 'lxml')


def re_str(str):
    """Return ``str`` with all spaces, CR, LF and tab characters removed.

    The parameter name shadows the builtin; it is kept so existing keyword
    callers continue to work.
    """
    return str.translate(_STRIP_TABLE)


def downUrl(url, path_name):
    """Download ``url`` to the local file ``path_name`` and report success."""
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")
    return None


def function(date):
    """Sort key: the integer price stored under ``'product_prices'``."""
    return date['product_prices']


def _first_int(text):
    """Return the first run of digits in ``text`` as an int.

    Raises IndexError when ``text`` contains no digit.
    """
    return int(re.findall(r"\d+", text)[0])


def _dump_sorted(goods, file_path):
    """Sort ``goods`` by price in place and pickle the list to ``file_path``."""
    goods.sort(key=function)
    with open(file_path, 'wb') as handle:
        pickle.dump(goods, handle)


def _load_pickle(file_path):
    """Unpickle and return the object stored in ``file_path``.

    NOTE(review): pickle executes arbitrary code on load; these files are
    expected to be the ones written by the ``download_*`` functions.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


def download_gucci_zh(download_type):
    """Collect one category from gucci.cn and pickle it to ``zh_goods_list.txt``.

    Requests listing pages 1..11 and stops at the first page without product
    info blocks. A page whose request fails is reported and skipped. An
    unknown ``download_type`` leaves the ``ni`` parameter empty.
    """
    # Millisecond timestamp appended to the URL as the ``_`` parameter.
    new_now_time = int(round(time.time() * 1000))
    print(new_now_time)
    img_save_path = './gucci/' + download_type + '/gucci_zh_/'
    mkdir(img_save_path)

    zh_url_i = 'https://www.gucci.cn/zh/itemList?ni='
    zh_url_o = '&direction=down&listName=ProductGrid&_=' + str(new_now_time)
    zh_url_ni = _ZH_CATEGORY_IDS.get(download_type, '')

    zh_goods_list = []
    for page_number in range(1, 12):
        url = zh_url_i + str(zh_url_ni) + '&pn=' + str(page_number) + zh_url_o
        try:
            Soup = openUrl(url)
        except Exception as e:
            print('error: ' + url)
            print(e)
            continue
        # Name, code and price elements are indexed at 2 * i below because
        # each product contributes two of them; images appear once.
        swipers = Soup.find_all(class_='spice-item-grid-info')
        goods_number = Soup.find_all(class_='spice-item-grid-img-box e-abtest-code-click')
        goods_prices = Soup.find_all(class_='spice-item-grid-price')
        imgs = Soup.find_all(class_='visual-img')
        if len(swipers) == 0:
            # No product info on this page: stop paging.
            print(page_number)
            break
        for i, img in enumerate(imgs):
            goods_item = goods_number[2 * i]['e-abtest-code']
            goods_name = swipers[2 * i].h2.text
            goods_price = re_str(goods_prices[2 * i].text).replace(',', '').replace('.', '')
            # The attribute holds a JSON object; its 'medium' entry is used.
            goods_img_url = json.loads(img['spice-data-image-src'])['medium']
            goods_url = 'https://www.gucci.cn/zh/pr/' + goods_item + '?nid=63&listName=ProductGrid&position=37&categoryPath='
            zh_goods_list.append({
                'product_title': goods_name,
                'product_url': goods_url,
                'product_prices': _first_int(goods_price),
                'product_all_prices': '',
                'product_img_url': goods_img_url,
                # Images are numbered in collection order.
                'url_path': img_save_path + str(len(zh_goods_list)) + '.jpg',
                'productCode': goods_item,
            })
            print(goods_item)
            print(goods_name)
            print(goods_price)
            print(goods_url)
        print(page_number)
        print(len(zh_goods_list))

    _dump_sorted(zh_goods_list, './gucci/' + download_type + '/zh_goods_list.txt')
    return None


def download_gucci_it(download_type):
    """Collect one category from gucci.com/it and pickle it to ``it_goods_list.txt``.

    Requests product-grid JSON pages 1..9 and stops at the first page with no
    items. A page whose request fails is reported and skipped.
    """
    img_save_path = './gucci/' + download_type + '/gucci_it_/'
    mkdir(img_save_path)

    it_url = ('https://www.gucci.com/it/it/c/productgrid?categoryCode='
              + _IT_CATEGORY_CODES.get(download_type, '')
              + '&show=Page&page=')
    # Product links are search URLs keyed by product code.
    it_goods_url_in = 'https://www.gucci.com/it/it/search?search-cat=header-search&text='

    it_goods_list = []
    print('loading.......')
    for page_number in range(1, 10):
        url = it_url + str(page_number)
        print(url)
        try:
            json_data = return_json(url)
        except Exception as e:
            print('error: ' + url)
            print(e)
            continue
        items = json.loads(json_data)['products']['items']
        if len(items) == 0:
            # No items on this page: stop paging.
            break
        for item in items:
            productCode = item['productCode']
            price = item['price'].replace(',', '').replace('.', '')
            productName = item['productName']
            img_url = 'http:' + item['primaryImage']['datasrcmedium']
            it_goods_list.append({
                'product_title': productName,
                'product_url': it_goods_url_in + productCode,
                'product_prices': _first_int(price),
                'product_all_prices': '',
                'product_img_url': img_url,
                'url_path': img_save_path + str(len(it_goods_list)) + '.jpg',
                'productCode': productCode,
            })
            print(productCode)
            print(price)
            print(productName)
            print(img_url)

    _dump_sorted(it_goods_list, './gucci/' + download_type + '/it_goods_list.txt')
    print('**********************************')
    print('**********************************')
    print('**********************************')
    print(it_goods_list)
    return None


def _kaola_urls(download_type):
    """Return the (prefix, suffix) around the page number of a kaola URL.

    Both parts are empty strings for an unknown ``download_type``.
    """
    query = _KAOLA_QUERIES.get(download_type)
    if query is None:
        return '', ''
    host, category, in_stock, pro_ids, head_category, change_content = query
    url_in = 'https://' + host + '/brand/1226-' + category + '.html?pageSize=60&pageNo='
    url_out = ('&sortfield=0&isStock=' + in_stock
               + '&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true'
               + '&proIds=' + pro_ids
               + '&lowerPrice=-1&upperPrice=-1&isBrand=0'
               + '&headCategoryId=' + head_category
               + '&backCategory=' + category
               + '&key=&changeContent=' + change_content
               + '&#search_crumbs')
    return url_in, url_out


def download_kaola(download_type):
    """Collect one category from kaola.com and pickle it to ``kaola_list.txt``.

    Requests brand pages 1..9 and stops at the first page without seller
    tags. A page whose request fails is reported and skipped, and so is a
    product whose colour tags cannot be read.
    """
    img_save_path = './gucci/' + download_type + '/kaola_zh_/'
    mkdir(img_save_path)
    url_in, url_out = _kaola_urls(download_type)

    kaola_list = []
    for page_number in range(1, 10):
        print(page_number)
        url = url_in + str(page_number) + url_out
        try:
            Soup = openUrl(url)
        except Exception as e:
            print(url)
            print('openUrl: ')
            print(e)
            continue
        # Product entries are read from index 11 in the title list, from
        # index 10 in the cur/img/price lists and from index 0 in the
        # seller-tag and colour lists.
        product_titles = Soup.find_all(class_="title")
        goods_prices_curs = Soup.find_all(class_='cur')
        goods_imgs = Soup.find_all(class_='img')
        goods_all_prices = Soup.find_all(class_='price')
        goods_colors = Soup.find_all(class_='skuwrap')
        sale_names = Soup.find_all(class_='selfflag')
        if len(sale_names) == 0:
            break
        for list_i in range(0, len(goods_all_prices) - 10):
            print('***************************************')
            title_tag = product_titles[list_i + 11]
            product_title = title_tag['title']
            href = title_tag['href']
            # hrefs that already name the host only need a scheme.
            if 'goods.kaola.com' in href:
                product_url = 'https:' + href
            else:
                product_url = 'https://goods.kaola.com' + href
            product_prices = goods_prices_curs[list_i + 10].text
            product_sales_name = re_str(sale_names[list_i].text)
            product_price = _first_int(product_prices)
            product_all_prices = goods_all_prices[list_i + 10].text
            print(product_title)
            print(product_url)
            print(product_prices)
            print(product_all_prices)
            img_url = goods_imgs[list_i + 10].img['data-src']
            # Drop the first two characters of data-src and prefix 'http://'.
            product_img_url = 'http://' + img_url[2:]
            product_other_color = '无'
            try:
                if len(goods_colors) > 0:
                    color_tags = goods_colors[list_i].find_all(class_='skutag')
                    # Last colour first, each followed by a comma.
                    product_other_color = ''.join(
                        tag['title'] + ',' for tag in reversed(color_tags))
            except Exception as e:
                print(e)
                continue
            kaola_list.append({
                'product_title': product_title,
                'product_url': product_url,
                'product_prices': product_price,
                'product_all_prices': product_all_prices,
                'product_img_url': product_img_url,
                'product_sales_name': product_sales_name,
                'product_other_color': product_other_color,
                'url_path': img_save_path + str(len(kaola_list)) + '.jpg',
            })
            print('---------------------------------------')

    _dump_sorted(kaola_list, './gucci/' + download_type + '/kaola_list.txt')
    print(kaola_list)
    print('**********************************')
    print('**********************************')
    print('**********************************')
    return None


def write_excel(download_type, book):
    """Add a worksheet named ``download_type`` to ``book`` and fill it.

    Reads the three pickled lists for ``download_type`` and writes them into
    separate column groups: columns B-F from ``it_goods_list``, H-L from
    ``zh_goods_list`` and N-T from ``kaola_list``. Each group starts at row 1
    below the header.

    Returns:
        The same ``book`` that was passed in (not closed).
    """
    base_dir = './gucci/' + download_type
    kaola_list = _load_pickle(base_dir + '/kaola_list.txt')
    it_goods_list = _load_pickle(base_dir + '/it_goods_list.txt')
    zh_goods_list = _load_pickle(base_dir + '/zh_goods_list.txt')

    sheet = book.add_worksheet(download_type)
    for column, width in enumerate(_COLUMN_WIDTHS):
        sheet.set_column(column, column, width)

    # One cell format shared by every text cell of the sheet.
    cell_format = book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',
        'valign': 'vcenter',
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })

    sheet.set_row(0, 22)
    for column, title in enumerate(_COLUMN_TITLES):
        sheet.write(0, column, title, cell_format)

    # gucci.cn images are inserted at a larger scale than the other two.
    gucci_zh_img_format = {'x_offset': 4, 'y_offset': 0, 'x_scale': 0.3, 'y_scale': 0.29}
    img_format = {'x_offset': 4, 'y_offset': 0, 'x_scale': 0.2, 'y_scale': 0.19}

    for row_number, product in enumerate(zh_goods_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 7, product['product_title'], cell_format)
        sheet.insert_image(row_number, 8, product['url_path'], gucci_zh_img_format)
        sheet.write(row_number, 9, product['productCode'], cell_format)
        sheet.write(row_number, 10, '¥' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 11, product['product_url'], cell_format)

    for row_number, product in enumerate(it_goods_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['url_path'], img_format)
        sheet.write(row_number, 3, product['productCode'], cell_format)
        sheet.write(row_number, 4, '€' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 5, product['product_url'], cell_format)

    for row_number, product in enumerate(kaola_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 13, product['product_title'], cell_format)
        sheet.insert_image(row_number, 14, product['url_path'], img_format)
        sheet.write(row_number, 15, product['product_url'], cell_format)
        sheet.write(row_number, 16, '¥' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 17, product['product_all_prices'], cell_format)
        sheet.write(row_number, 18, product['product_other_color'], cell_format)
        sheet.write(row_number, 19, product['product_sales_name'], cell_format)

    print(download_type + ': 写入EXCEL成功')
    return book


def get_all_list():
    """Return ``(men_all, women_all)``: the download types for men and women."""
    men_all = ['men_all_shoes', 'men_all_bags', 'men_all_belts', 'men_all_clothes', 'men_all_wallets',
               'men_all_scarves']
    women_all = ['women_all_shoes', 'women_all_bags', 'women_all_belts', 'women_all_clothes', 'women_all_wallets',
                 'women_all_scarves']
    return men_all, women_all


def write_excel_to_path(men_all, excel_name):
    """Write one worksheet per download type in ``men_all`` to ``excel_name``."""
    book = xlsxwriter.Workbook(excel_name)
    for download_type in men_all:
        print(download_type)
        book = write_excel(download_type, book)
    book.close()
    print(excel_name + ': 写入EXCEL成功')
    return None


# Usage notes:
# 1. Change the current download type
# 2. Update the URLs
# 3. Turn on the download switch
# Download types: 'men_all_shoes' 'men_all_bags' 'men_all_belts'
# 'men_all_clothes' 'men_all_wallets' 'women_all_shoes' 'women_all_bags'
# 'women_all_belts' 'women_all_clothes' 'women_all_wallets'


def pool_download_men(number):
    """Scrape all three sites for the men's download type at index ``number``."""
    men_all, women_all = get_all_list()
    download_type = men_all[number]
    download_gucci_zh(download_type)
    download_gucci_it(download_type)
    download_kaola(download_type)


def pool_download_women(number):
    """Scrape all three sites for the women's download type at index ``number``."""
    men_all, women_all = get_all_list()
    download_type = women_all[number]
    download_gucci_zh(download_type)
    download_gucci_it(download_type)
    download_kaola(download_type)


def strat_download():
    """Scrape every category in a process pool: all men's, then all women's."""
    men_all, women_all = get_all_list()
    pool = Pool()
    try:
        # One task per category index instead of a hard-coded count.
        pool.map(pool_download_men, range(len(men_all)))
        pool.map(pool_download_women, range(len(women_all)))
    finally:
        # Shut the workers down even when a task raised.
        pool.close()
        pool.join()


def _read_goods_list(file_path):
    """Unpickle and return the product list stored in ``file_path``.

    NOTE(review): pickle executes arbitrary code on load; only point this at
    files produced by this script.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


class DownImgClass:
    """Download the product images referenced by the pickled product lists."""

    def __init__(self):
        # Entries are {'img_url': ..., 'img_path': ...} for the category
        # currently being downloaded.
        self.imgurl_all_list = []

    def start_download(self):
        """Download images for every men's, then every women's category."""
        men_all, women_all = get_all_list()
        for download_type in men_all:
            self.multi_download_img(download_type)
        for download_type in women_all:
            self.multi_download_img(download_type)

    def multi_download_img(self, download_type):
        """Download all images of one category with a process pool.

        Rebuilds ``self.imgurl_all_list`` from the three pickled lists of
        ``download_type`` (gucci.cn entries first, then gucci.com/it, then
        kaola) and maps ``second_multi_download_img`` over its indices.
        """
        self.imgurl_all_list = []
        base_dir = './gucci/' + download_type
        kaola_list = _read_goods_list(base_dir + '/kaola_list.txt')
        it_goods_list = _read_goods_list(base_dir + '/it_goods_list.txt')
        zh_goods_list = _read_goods_list(base_dir + '/zh_goods_list.txt')
        print(zh_goods_list)
        print('read')
        for goods in (zh_goods_list, it_goods_list, kaola_list):
            for product in goods:
                self.imgurl_all_list.append({'img_url': product['product_img_url'],
                                             'img_path': product['url_path']})
        print(len(self.imgurl_all_list))
        print('down2')
        pool = Pool()
        try:
            pool.map(self.second_multi_download_img, range(len(self.imgurl_all_list)))
        finally:
            # Shut the workers down even when a task raised.
            pool.close()
            pool.join()
        print('down1')

    def second_multi_download_img(self, number):
        """Download the image at index ``number`` of ``self.imgurl_all_list``.

        A failed download is printed and otherwise ignored, so one bad image
        does not abort the rest.
        """
        entry = self.imgurl_all_list[number]
        print(entry)
        img_url = entry['img_url']
        url_path = entry['img_path']
        print('down')
        try:
            downUrl(img_url, url_path)
        except Exception as e:
            print(e)


if __name__ == "__main__":
    a = time.time()
    # Scrape the listings first with strat_download(); this run only
    # downloads the images of already-scraped listings.
    down_img = DownImgClass()
    down_img.start_download()
    b = time.time()
    # Total elapsed time in minutes.
    print((b - a) / 60)

页面列表

ITEM_HTML