6、gucci_backup_2
<pre><code>from bs4 import BeautifulSoup
import urllib.request
# import pandas as pd
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys
from functools import partial
from multiprocessing import Pool
# from PyQt5.QtWidgets import QApplication, QWidget, QPushButton, QHBoxLayout, QTextEdit
# Set the default timeout for all new sockets to 30 seconds.
socket.setdefaulttimeout(30)
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so urllib requests made by this process skip TLS
# certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context
def mkdir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Surrounding whitespace and trailing backslashes are stripped from *path*
    before it is used.

    Returns:
        True when this call created the directory, False when something
        already existed at that path.
    """
    path = path.strip()
    path = path.rstrip("\\")
    try:
        # Create-then-catch instead of exists()+makedirs(): two processes
        # asking for the same directory can no longer race between the
        # check and the creation.
        os.makedirs(path)
    except FileExistsError:
        # The directory is already there: report it and do not create it.
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True
def openUrl(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with a desktop Safari User-Agent header, the body is
    decoded as UTF-8 and parsed with the 'lxml' parser. Errors raised by
    urllib or by the decoding step propagate to the caller.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    return BeautifulSoup(html, 'lxml')
def return_json(url):
    """Fetch *url* and return the response body as UTF-8 decoded text.

    Despite the name, no JSON parsing happens here: the caller receives the
    raw text. The request is sent with a desktop Safari User-Agent header.
    Errors raised by urllib or by the decoding step propagate to the caller.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    return html
def re_str(str):
    """Return *str* with every space, CR, LF and tab character removed."""
    # The parameter name shadows the builtin; it is kept for keyword callers.
    unwanted = ' \r\n\t'
    return ''.join(ch for ch in str if ch not in unwanted)
def downUrl(url, path_name):
    """Download *url* into the local file *path_name*, then print a success line.

    Returns None. Errors raised by the download propagate to the caller.
    """
    urllib.request.urlretrieve(url, filename=path_name)
    done_message = path_name + ": success"
    print(done_message)
def function(date):
    """Sort key: return the 'product_prices' entry of the record *date*."""
    price = date['product_prices']
    return price
def download_gucci_zh(download_type):
    """Scrape the gucci.cn product grid for one category and pickle the result.

    Pages 1..11 of the category's item list are fetched until a page without
    product entries is seen. One record per product is collected, the records
    are sorted by price and written to
    './gucci/<download_type>/zh_goods_list.txt' with pickle.

    Each record has the keys 'product_title', 'product_url',
    'product_prices' (int), 'product_all_prices', 'product_img_url',
    'url_path' and 'productCode'. 'url_path' is the local file name reserved
    for the product image; the image itself is not downloaded here.

    Args:
        download_type: category name such as 'men_all_shoes'. An unknown
            name leaves the category id in the URL empty.

    Returns:
        None.
    """
    # Category name -> `ni` id of the gucci.cn itemList endpoint
    # (the source notes that neckties would be 65).
    category_ids = {
        'men_all_clothes': 97,
        'men_all_shoes': 17,
        'men_all_bags': 16,
        'men_all_belts': 64,
        'men_all_wallets': 63,
        'men_all_scarves': 66,
        'women_all_clothes': 90,
        'women_all_shoes': 13,
        'women_all_bags': 12,
        'women_all_belts': 41,
        'women_all_wallets': 40,
        'women_all_scarves': 42,
    }

    # Millisecond timestamp, sent as the `_` query parameter.
    new_now_time = int(round(time.time() * 1000))
    print(new_now_time)

    img_save_path = './gucci/' + download_type + '/gucci_zh_/'
    mkdir(img_save_path)

    zh_url_i = 'https://www.gucci.cn/zh/itemList?ni='
    zh_url_o = '&direction=down&listName=ProductGrid&_=' + str(new_now_time)
    zh_url_ni = category_ids.get(download_type, '')

    zh_goods_list = []
    zh_goods_i_id = 0
    for page_number in range(1, 12):
        url = zh_url_i + str(zh_url_ni) + '&pn=' + str(page_number) + zh_url_o
        try:
            soup = openUrl(url)
        except Exception as e:
            # A page that cannot be fetched is skipped, not fatal.
            print('error: ' + url)
            print(e)
            continue

        # Per the source notes, name / code / price elements occur twice per
        # product, while the image element occurs once; hence the 2 * i below.
        swipers = soup.find_all(class_='spice-item-grid-info')
        goods_number = soup.find_all(class_='spice-item-grid-img-box e-abtest-code-click')
        goods_prices = soup.find_all(class_='spice-item-grid-price')
        imgs = soup.find_all(class_='visual-img')

        if not swipers:
            # No product entries on this page: stop paging.
            print(page_number)
            break

        for i, img in enumerate(imgs):
            goods_item = goods_number[2 * i]['e-abtest-code']
            goods_name = swipers[2 * i].h2.text
            goods_price = re_str(goods_prices[2 * i].text).replace(',', '').replace('.', '')
            # First run of digits in the cleaned price text.
            product_price = re.findall(r"\d+", goods_price)[0]
            imgurls = json.loads(img['spice-data-image-src'])
            goods_img_url = imgurls['medium']
            goods_url = 'https://www.gucci.cn/zh/pr/' + goods_item + '?nid=63&listName=ProductGrid&position=37&categoryPath='

            # Without this append the list stays empty and the pickle written
            # below never contains any product.
            zh_goods_list.append({
                'product_title': goods_name,
                'product_url': goods_url,
                'product_prices': int(product_price),
                'product_all_prices': '',
                'product_img_url': goods_img_url,
                'url_path': img_save_path + str(zh_goods_i_id) + '.jpg',
                'productCode': goods_item,
            })
            zh_goods_i_id = zh_goods_i_id + 1

            print(goods_item)
            print(goods_name)
            print(goods_price)
            print(goods_url)
            print(page_number)
            print(zh_goods_i_id)

    zh_goods_list.sort(key=function)
    # `with` closes the file handle, so the pickle is flushed to disk.
    with open('./gucci/' + download_type + '/zh_goods_list.txt', 'wb') as out_file:
        pickle.dump(zh_goods_list, out_file)
    return None
# *************************************
# *************************************
# gucci_it 组装url 新建文件夹
def download_gucci_it(download_type):
    """Scrape the gucci.com (Italy) product grid for one category and pickle it.

    Pages 1..9 of the category's product-grid JSON are fetched until a page
    with no items is seen. One record per product is collected, the records
    are sorted by price and written to
    './gucci/<download_type>/it_goods_list.txt' with pickle.

    Each record has the keys 'product_title', 'product_url',
    'product_prices' (int), 'product_all_prices', 'product_img_url',
    'url_path' and 'productCode'. 'url_path' is the local file name reserved
    for the product image; the image itself is not downloaded here.

    Args:
        download_type: category name such as 'men_all_shoes'. An unknown
            name leaves the category code in the URL empty.

    Returns:
        None.
    """
    # Category name -> categoryCode of the Italian product-grid endpoint.
    category_codes = {
        'men_all_bags': 'men-bags',
        'men_all_clothes': 'men-readytowear',
        'men_all_shoes': 'men-shoes',
        'men_all_belts': 'men-accessories-belts',
        'men_all_wallets': 'men-accessories-wallets',
        'men_all_scarves': 'men-accessories-scarves',
        'women_all_bags': 'women-handbags',
        'women_all_clothes': 'women-readytowear',
        'women_all_shoes': 'women-shoes',
        'women_all_belts': 'women-accessories-belts',
        'women_all_wallets': 'women-accessories-wallets',
        'women_all_scarves': 'women-accessories-silks-and-scarves',
    }

    img_save_path = './gucci/' + download_type + '/gucci_it_/'
    mkdir(img_save_path)

    it_url_in = 'https://www.gucci.com/it/it/c/productgrid?categoryCode='
    it_url_out = '&show=Page&page='
    it_url = it_url_in + category_codes.get(download_type, '') + it_url_out
    # Prefix of the per-product link (a site search for the product code).
    it_goods_url_in = 'https://www.gucci.com/it/it/search?search-cat=header-search&text='
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}

    it_goods_list = []
    it_goods_i_id = 0
    print('loading.......')
    for page_number in range(1, 10):
        url = it_url + str(page_number)
        print(url)
        try:
            req = urllib.request.Request(url, headers=it_header)
            # The context manager closes the HTTP response in every case.
            with urllib.request.urlopen(req) as response:
                json_data = response.read().decode("utf-8")
        except Exception as e:
            # A page that cannot be fetched is skipped, not fatal.
            print('error: ' + url)
            print(e)
            continue

        hjson = json.loads(json_data)
        items = hjson['products']['items']
        if not items:
            # No items on this page: stop paging.
            break

        for item in items:
            productCode = item['productCode']
            price = item['price'].replace(',', '').replace('.', '')
            # First run of digits in the cleaned price text.
            product_price = re.findall(r"\d+", price)[0]
            productName = item['productName']
            img_url = 'http:' + item['primaryImage']['datasrcmedium']

            # Without this append the list stays empty and the pickle written
            # below never contains any product.
            it_goods_list.append({
                'product_title': productName,
                'product_url': it_goods_url_in + productCode,
                'product_prices': int(product_price),
                'product_all_prices': '',
                'product_img_url': img_url,
                'url_path': img_save_path + str(it_goods_i_id) + '.jpg',
                'productCode': productCode,
            })
            it_goods_i_id = it_goods_i_id + 1

            print(productCode)
            print(price)
            print(productName)
            print(img_url)

    it_goods_list.sort(key=function)
    # `with` closes the file handle, so the pickle is flushed to disk.
    with open('./gucci/' + download_type + '/it_goods_list.txt', 'wb') as out_file:
        pickle.dump(it_goods_list, out_file)
    print('**********************************')
    print('**********************************')
    print('**********************************')
    print(it_goods_list)
    return None
# *************************************
# gucci_kaola 组装url 新建文件夹
def download_kaola(download_type):
    """Scrape the Kaola brand listing for one category and pickle the result.

    Pages 1..9 of the category listing are fetched until a page without any
    seller tag is seen. One record per product is collected, the records are
    sorted by price and written to './gucci/<download_type>/kaola_list.txt'
    with pickle.

    Each record has the keys 'product_title', 'product_url',
    'product_prices' (int), 'product_all_prices', 'product_img_url',
    'product_sales_name', 'product_other_color' and 'url_path'. 'url_path'
    is the local file name reserved for the product image; the image itself
    is not downloaded here.

    Args:
        download_type: category name such as 'men_all_shoes'. An unknown
            name leaves both URL parts empty.

    Returns:
        None.
    """
    # Category name -> (URL part before the page number, URL part after it).
    listing_urls = {
        'men_all_shoes': (
            'https://search.kaola.com/brand/1226-1078.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1078&key=&changeContent=c&#search_crumbs',
        ),
        'men_all_bags': (
            'https://search.kaola.com/brand/1226-1027.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1027&key=&changeContent=c&#search_crumbs',
        ),
        'men_all_clothes': (
            'https://search.kaola.com/brand/1226-1047.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1047&key=&changeContent=c&#search_crumbs',
        ),
        'men_all_belts': (
            'https://search.kaola.com/brand/1226-1073.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111380&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=1058&backCategory=1073&key=&changeContent=0&#search_crumbs',
        ),
        'men_all_wallets': (
            'https://www.kaola.com/brand/1226-6259.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111380&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=6259&key=&changeContent=isStock&#search_crumbs',
        ),
        'men_all_scarves': (
            'https://search.kaola.com/brand/1226-1072.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=false&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111380&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=1058&backCategory=1072&key=&changeContent=0&#search_crumbs',
        ),
        'women_all_shoes': (
            'https://search.kaola.com/brand/1226-1077.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111421&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1077&key=&changeContent=c&#search_crumbs',
        ),
        'women_all_bags': (
            'https://search.kaola.com/brand/1226-1028.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111421&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1028&key=&changeContent=c&#search_crumbs',
        ),
        'women_all_clothes': (
            'https://search.kaola.com/brand/1226-1048.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=1048&key=&changeContent=crumbs_0&#search_crumbs',
        ),
        'women_all_belts': (
            'https://search.kaola.com/brand/1226-1073.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111421&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=1058&backCategory=1073&key=&changeContent=c&#search_crumbs',
        ),
        'women_all_wallets': (
            'https://search.kaola.com/brand/1226-6259.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=true&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111421&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=-1&backCategory=6259&key=&changeContent=0&#search_crumbs',
        ),
        'women_all_scarves': (
            'https://search.kaola.com/brand/1226-1072.html?pageSize=60&pageNo=',
            '&sortfield=0&isStock=false&isSelfProduct=false&isPromote=false&isTaxFree=false&isDesc=true&proIds=100224_4111421&lowerPrice=-1&upperPrice=-1&isBrand=0&headCategoryId=1058&backCategory=1072&key=&changeContent=0&#search_crumbs',
        ),
    }

    img_save_path = './gucci/' + download_type + '/kaola_zh_/'
    mkdir(img_save_path)
    url_in, url_out = listing_urls.get(download_type, ('', ''))

    kaola_list = []
    product_number = 0
    for page_number in range(1, 10):
        print(page_number)
        url = url_in + str(page_number) + url_out
        try:
            soup = openUrl(url)
        except Exception as e:
            # A page that cannot be fetched is skipped, not fatal.
            print(url)
            print('openUrl: ')
            print(e)
            continue

        # Per the source notes the product data starts at list offsets
        # 11 (title), 10 (cur), 10 (img), 10 (price) and 0 (selfflag).
        product_titles = soup.find_all(class_="title")
        goods_prices_curs = soup.find_all(class_='cur')
        goods_imgs = soup.find_all(class_='img')
        goods_all_prices = soup.find_all(class_='price')
        goods_colors = soup.find_all(class_='skuwrap')
        sale_names = soup.find_all(class_='selfflag')
        if not sale_names:
            # No seller tags on this page: stop paging.
            break

        for list_i in range(0, len(goods_all_prices) - 10):
            print('***************************************')
            title_tag = product_titles[list_i + 11]
            product_title = title_tag['title']
            product_url_b = title_tag['href']
            # The href may already contain the goods host; otherwise add it.
            if 'goods.kaola.com' in product_url_b:
                product_url = 'https:' + product_url_b
            else:
                product_url = 'https://goods.kaola.com' + product_url_b
            product_prices = goods_prices_curs[list_i + 10].text
            product_sales_name = re_str(sale_names[list_i].text)
            # First run of digits in the displayed price.
            product_price = re.findall(r"\d+", product_prices)[0]
            product_all_prices = goods_all_prices[list_i + 10].text
            print(product_title)
            print(product_url)
            print(product_prices)
            print(product_all_prices)
            img_url = goods_imgs[list_i + 10].img['data-src']
            # Drop the first two characters of data-src and prefix http://.
            product_img_url = 'http://' + img_url[2:]

            product_other_color = '无'
            try:
                if len(goods_colors) > 0:
                    goods_color_list = goods_colors[list_i].find_all(class_='skutag')
                    product_other_color = ''
                    # Each colour is prepended, so the result lists colours
                    # in reverse order, each followed by a comma.
                    for color_tag in goods_color_list:
                        product_other_color = color_tag['title'] + ',' + product_other_color
            except Exception as e:
                # A product whose colour block cannot be read is skipped.
                print(e)
                continue

            # Without this append the list stays empty and the pickle written
            # below never contains any product.
            kaola_list.append({
                'product_title': product_title,
                'product_url': product_url,
                'product_prices': int(product_price),
                'product_all_prices': product_all_prices,
                'product_img_url': product_img_url,
                'product_sales_name': product_sales_name,
                'product_other_color': product_other_color,
                'url_path': img_save_path + str(product_number) + '.jpg',
            })
            product_number = product_number + 1
            print('---------------------------------------')

    kaola_list.sort(key=function)
    # `with` closes the file handle, so the pickle is flushed to disk.
    with open('./gucci/' + download_type + '/kaola_list.txt', 'wb') as out_file:
        pickle.dump(kaola_list, out_file)
    print(kaola_list)
    print('**********************************')
    print('**********************************')
    print('**********************************')
    return None
# def write_all_excel(book, download_type):
# sheet = book.add_worksheet(download_type)
def write_excel(download_type, book):
    """Add one worksheet for *download_type* to *book* and fill it.

    The three pickled product lists of the category are read from
    './gucci/<download_type>/' and written side by side: the Italian list in
    columns B-F, the Chinese list in columns H-L and the Kaola list in
    columns N-T. Row 0 holds the headers; each product row also gets its
    local image inserted.

    Args:
        download_type: category name; used for the directory and sheet name.
        book: workbook object providing add_worksheet() and add_format()
            (an xlsxwriter Workbook in this file).

    Returns:
        The same *book* object. It is not closed here.
    """
    base_dir = './gucci/' + download_type
    # NOTE: pickle must only be used on files this script wrote itself.
    with open(base_dir + '/kaola_list.txt', 'rb') as in_file:
        kaola_list = pickle.load(in_file)
    with open(base_dir + '/it_goods_list.txt', 'rb') as in_file:
        it_goods_list = pickle.load(in_file)
    with open(base_dir + '/zh_goods_list.txt', 'rb') as in_file:
        zh_goods_list = pickle.load(in_file)

    sheet = book.add_worksheet(download_type)

    # Column widths, in sheet order A..T.
    column_widths = (
        ("A:A", 5),       # Europe marker
        ("B:B", 80.88),   # product name
        ("C:C", 10.5),    # front image
        ("D:D", 19.38),   # product code
        ("E:E", 12),      # European retail price
        ("F:F", 82),      # product link
        ("G:G", 5),       # China marker
        ("H:H", 55),      # product name
        ("I:I", 10.5),    # front image
        ("J:J", 19.38),   # product code
        ("K:K", 8.25),    # Chinese retail price
        ("L:L", 106),     # product link
        ("M:M", 5),       # Kaola marker
        ("N:N", 89),      # product name
        ("O:O", 11.5),    # front image
        ("P:P", 50.38),   # product link
        ("Q:Q", 14.25),   # final price
        ("R:R", 25.75),   # all prices
        ("S:S", 25.75),   # alternative colours
        ("T:T", 28.75),   # seller
    )
    for column_range, width in column_widths:
        sheet.set_column(column_range, width)

    # One cell format shared by every written cell.
    cell_style = {
        'font_size': 11,
        'bold': False,
        'align': 'center',
        'valign': 'vcenter',
        'font_name': u'微软雅黑',
        'text_wrap': False,
    }
    cell_format = book.add_format(cell_style)

    # Header row: height 22, one title per column starting at column 0.
    sheet.set_row(0, 22)
    headers = (
        '欧洲', '商品名称', '正面图', '货号', '欧洲零售价', '商品链接',
        '中国', '商品名称', '正面图', '货号', '中国零售价', '商品链接',
        '考拉', '商品名称', '正面图', '商品链接', '到手价', '所有售价',
        '备选颜色', '销售商家',
    )
    for column, title in enumerate(headers):
        sheet.write(0, column, title, cell_format)

    # Image placement options; the Chinese images use a larger scale.
    gucci_zh_img_format = {'x_offset': 4,
                           'y_offset': 0,
                           'x_scale': 0.3,
                           'y_scale': 0.29}
    img_format = {'x_offset': 4,
                  'y_offset': 0,
                  'x_scale': 0.2,
                  'y_scale': 0.19}

    # Chinese list -> columns 7..11. Every product row is 52 high.
    for row_number, product in enumerate(zh_goods_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 7, product['product_title'], cell_format)
        sheet.insert_image(row_number, 8, product['url_path'], gucci_zh_img_format)
        sheet.write(row_number, 9, product['productCode'], cell_format)
        sheet.write(row_number, 10, '¥' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 11, product['product_url'], cell_format)

    # Italian list -> columns 1..5.
    for row_number, product in enumerate(it_goods_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['url_path'], img_format)
        sheet.write(row_number, 3, product['productCode'], cell_format)
        sheet.write(row_number, 4, '€' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 5, product['product_url'], cell_format)

    # Kaola list -> columns 13..19.
    for row_number, product in enumerate(kaola_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 13, product['product_title'], cell_format)
        sheet.insert_image(row_number, 14, product['url_path'], img_format)
        sheet.write(row_number, 15, product['product_url'], cell_format)
        sheet.write(row_number, 16, '¥' + str(product['product_prices']), cell_format)
        sheet.write(row_number, 17, product['product_all_prices'], cell_format)
        sheet.write(row_number, 18, product['product_other_color'], cell_format)
        sheet.write(row_number, 19, product['product_sales_name'], cell_format)

    print(download_type + ': 写入EXCEL成功')
    return book
def get_all_list():
    """Return the men's and women's category names as two lists.

    Both lists use the same order: shoes, bags, belts, clothes, wallets,
    scarves.
    """
    suffixes = ('shoes', 'bags', 'belts', 'clothes', 'wallets', 'scarves')
    men_all = ['men_all_' + suffix for suffix in suffixes]
    women_all = ['women_all_' + suffix for suffix in suffixes]
    return men_all, women_all
def write_excel_to_path(men_all, excel_name):
    """Write one worksheet per category in *men_all* into workbook *excel_name*.

    A new xlsxwriter workbook is created, write_excel() is called for every
    category name in order, and the workbook is closed at the end.
    Returns None.
    """
    book = xlsxwriter.Workbook(excel_name)
    for download_type in men_all:
        print(download_type)
        book = write_excel(download_type, book)
    book.close()
    print(excel_name + ': 写入EXCEL成功')
# 1、更改 当前下载类型
# 2、更改个URL
# 3、打开下载开关
# men_all_types = []
# men_all_types.append('men_all_shoes')
# 'men_all_shoes' 'men_all_bags' 'men_all_belts' 'men_all_clothes' 'men_all_wallets'
# 'women_all_shoes' 'women_all_bags' 'women_all_belts' 'women_all_clothes' 'women_all_wallets'
# download_type = 'women_all_wallets'
# #download = False #True
# download_gucci_zh(download_type)
# download_gucci_it(download_type)
# download_kaola(download_type)
# write_excel(download_type)
def pool_download_men(number):
    """Run the three scrapers for the men's category at index *number*."""
    men_all, _ = get_all_list()
    category = men_all[number]
    # Same order as always: Gucci CN, then Gucci IT, then Kaola.
    for scrape in (download_gucci_zh, download_gucci_it, download_kaola):
        scrape(category)
def pool_download_women(number):
    """Run the three scrapers for the women's category at index *number*."""
    _, women_all = get_all_list()
    category = women_all[number]
    # Same order as always: Gucci CN, then Gucci IT, then Kaola.
    for scrape in (download_gucci_zh, download_gucci_it, download_kaola):
        scrape(category)
def strat_download():
    """Scrape the category at index 5 for men, then for women.

    Index 5 is the last entry of each list returned by get_all_list()
    ('men_all_scarves' / 'women_all_scarves'). Both runs happen one after
    the other in the calling process.

    No multiprocessing Pool is created here: nothing was ever submitted to
    it, and it was left open whenever a scraper raised.
    """
    pool_download_men(5)
    pool_download_women(5)
# write_excel_to_path(men_all, 'men_all.xlsx')
# write_excel_to_path(women_all, 'women_all.xlsx')
# download = True
# strat_download()
class DownImgClass:
    """Download the product images listed in the pickled scrape results."""

    def __init__(self):
        # Download jobs of the category being processed; each entry is a
        # dict with the keys 'img_url' and 'img_path'.
        self.imgurl_all_list = []

    def start_download(self):
        """Download the images of every men's and women's category.

        One process pool is shared by all categories and is always closed
        and joined, even when a category fails.
        """
        men_all, women_all = get_all_list()
        pool = Pool()
        try:
            for download_type in men_all + women_all:
                self.multi_download_img(download_type, pool)
        finally:
            pool.close()
            pool.join()

    def multi_download_img(self, download_type, pool):
        """Queue and download all images of one category through *pool*.

        Reads the three pickled product lists from
        './gucci/<download_type>/', rebuilds self.imgurl_all_list from them
        (Chinese list first, then Italian, then Kaola) and maps
        second_multi_download_img over every job index.

        Args:
            download_type: category name, e.g. 'men_all_shoes'.
            pool: object with a map(func, iterable) method (a
                multiprocessing Pool in this file).
        """
        self.imgurl_all_list = []
        base_dir = './gucci/' + download_type
        # NOTE: pickle must only be used on files this script wrote itself.
        with open(base_dir + '/kaola_list.txt', 'rb') as in_file:
            kaola_list = pickle.load(in_file)
        with open(base_dir + '/it_goods_list.txt', 'rb') as in_file:
            it_goods_list = pickle.load(in_file)
        with open(base_dir + '/zh_goods_list.txt', 'rb') as in_file:
            zh_goods_list = pickle.load(in_file)
        print(zh_goods_list)
        print('read')

        for product in zh_goods_list + it_goods_list + kaola_list:
            self.imgurl_all_list.append({
                # The image URL, not the product page URL, is what has to be
                # saved under the record's local image path.
                'img_url': product['product_img_url'],
                'img_path': product['url_path'],
            })

        print(len(self.imgurl_all_list))
        print('down2')
        # The bound method already carries `self`; binding it a second time
        # via partial(..., self=self) raises TypeError on every call.
        pool.map(self.second_multi_download_img, range(len(self.imgurl_all_list)))
        print('down1')

    def second_multi_download_img(self, number):
        """Download the job at index *number*; failures are printed, not raised."""
        job = self.imgurl_all_list[number]
        print('down')
        try:
            # Jobs are stored under 'img_url' / 'img_path'.
            downUrl(job['img_url'], job['img_path'])
        except Exception as e:
            print(e)
if __name__ == "__main__":
    # Script entry point: download all product images and report the time.
    started = time.time()
    down_img = DownImgClass()
    down_img.start_download()
    finished = time.time()
    # End minus start, so the printed elapsed seconds are positive.
    print(finished - started)
# str_text = 'hello word'
# app = QApplication(sys.argv)
# w = QWidget()
# w.resize(350, 450)
# w.move(400, 300)
# w.setWindowTitle('Simple')
#
# qText = QTextEdit()
# qText.setText('hello word')
# updateBtn = QPushButton('修改')
# updateBtn.setStyleSheet(''' text-align : center;
# background-color : NavajoWhite;
# height : 30px;
# border-style: outset;
# font : 13px ''')
# updateBtn.clicked.connect(strat_download)
# # updateBtn.mouseDoubleClickEvent()
# hLayout = QHBoxLayout()
# hLayout.addWidget(updateBtn)
# hLayout.addWidget(qText)
# w.setLayout(hLayout)
# w.show()
# sys.exit(app.exec_())
</code></pre>