1、yangmatou
<pre><code>
import urllib.request
import ssl
from multiprocessing import Pool
import re
import json
import time
import random
import xlsxwriter
import os
import pickle
import socket
# Apply a 30-second default timeout to sockets created from here on.
socket.setdefaulttimeout(30)
# NOTE(review): this replaces urllib's default HTTPS context factory with the
# unverified one, so certificate checks are skipped process-wide — confirm
# that this is intentional.
ssl._create_default_https_context = ssl._create_unverified_context
# Sample product-list API request (URL-encoded query string), kept for reference:
# https://www.ymatou.com/products/api/getProductListByCondition?conditions%5B0%5D%5Bfilter%5D=
# ibrandid&conditions%5B0%5D%5Bvalue%5D%5B%5D=10134&conditions%5B0%5D%5Bop%5D=or&
# copuonCode=&keyword=&tagType=3&tagValue=1178&pageIndex=1
def mkdir(path):
    """Create the directory ``path`` (including parents) if it is missing.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if something
        already exists at that path.
    """
    path = path.strip().rstrip("\\")
    try:
        # Create directly instead of testing os.path.exists() first: a
        # check-then-create sequence can race with another process.
        os.makedirs(path)
    except FileExistsError:
        # The path already exists, so nothing is created.
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True
def open_url(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The request is sent with a desktop Safari ``User-Agent`` header. The
    body is returned as-is; no HTML/JSON parsing happens here.

    Args:
        url: Address to request.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the response even if read()/decode() fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
# Keyword-search product-list API; the page number is appended to the end.
new_url_i = (
    'https://www.ymatou.com/products/api/getProductListByCondition'
    '?copuonCode=&keyword=gucci&tagType=&tagValue=&pageIndex='
)

# Pieces of the seller-page API URL, assembled as
# url_i + <page index> + url_o + <page start offset> + url_e.
url_i = (
    'https://www.ymatou.com/sellerhome/api/SellerProduct?callback='
    'jQuery33107037562205245479_1552098836879&sellerId=21948758&pageIndex='
)
url_o = '&brandId=10134&categoryId=0&priceSort=&hasCoupon=1&pageStart='
url_e = '&_=1552098836885'

# Sample of a fully assembled seller-page URL (page 3, pageStart=80).
url2 = (
    'https://www.ymatou.com/sellerhome/api/SellerProduct?callback='
    'jQuery33107037562205245479_1552098836879&sellerId=21948758&pageIndex='
    '3&brandId=10134&categoryId=0&priceSort=&hasCoupon=1&pageStart=80&_=1552098836886'
)
def _product_record(product):
    """Map one product entry of the search API response onto the stored dict."""
    return {
        'MinPrice': product['MinPrice'],
        'MaxPrice': product['MaxPrice'],
        'SellerName': product['SellerName'],
        'MinVipPrice': product['MinVipPrice'],
        'product_img_url': product['MainPic'],
        'product_url': 'https://www.ymatou.com/product/' + product['ProductId'] + '.html',
        'product_title': product['Title'],
        # The displayed price is the minimum price.
        'product_price': product['MinPrice'],
    }


def get_product():
    """Scrape every page of the keyword search and pickle the product list.

    Page 1 is fetched first to read the total product count; the remaining
    pages are then fetched one by one. The collected dicts are pickled to
    ``work_path + 'yangmatou.txt'``. ``work_path`` is a module-level global
    that must be set before this function is called (the ``__main__`` block
    sets it).

    Raises:
        KeyError: If the first page lacks the expected ``result`` fields, or
            any product entry lacks an expected field.
    """
    first_page = json.loads(open_url(new_url_i + str(1)))
    product_infos = [_product_record(p) for p in first_page['result']['Products']]
    product_total = first_page['result']['Total']
    # Assumes 50 products per page; when the total is an exact multiple of
    # 50 this requests one trailing page, which is expected to be empty.
    page_nums = product_total // 50 + 1

    for i in range(2, page_nums + 1):
        print('pages: ' + str(i))
        json_data = json.loads(open_url(new_url_i + str(i)))
        try:
            product_list = json_data['result']['Products']
        except (KeyError, TypeError) as e:
            # Skip a malformed page. Without this `continue` the previous
            # page's products would be appended a second time.
            print(e)
            continue
        product_infos.extend(_product_record(p) for p in product_list)

    with open(work_path + 'yangmatou' + '.txt', 'wb') as f:
        pickle.dump(product_infos, f)
    print(': success')
# Extracts the JSON object embedded in the seller-page response text.
_SELLER_PAYLOAD_RE = re.compile(r'\{"Code.*?(.*?).*?\}\]\}\}\}')


def _seller_product_record(product):
    """Map one seller-page product entry onto the stored dict."""
    return {
        'product_img_url': product['PicUrl'],
        'TaxFarming': product['TaxFarming'],
        'FreeShipping': product['FreeShipping'],
        'product_url': 'https://www.ymatou.com/product/' + product['ProductId'] + '.html',
        'product_title': product['Title'],
        'product_price': product['Price'],
    }


def _fetch_seller_page(page_index):
    """Fetch one seller page and return its ``Data.ProductInfo`` dict."""
    # The pageStart offset advances by 40 per page (page 1 -> 0, page 2 -> 40).
    url = url_i + str(page_index) + url_o + str((page_index - 1) * 40) + url_e
    body = open_url(url)
    match = _SELLER_PAYLOAD_RE.search(body)
    if match is None:
        raise ValueError('no product JSON found in response for ' + url)
    return json.loads(match.group())['Data']['ProductInfo']


def get_product_old():
    """Scrape the seller-page API (older endpoint) and pickle the products.

    The first page provides the total product count, from which the number
    of pages is derived. The collected dicts are pickled to
    ``work_path + 'yangmatou.txt'``. ``work_path`` is a module-level global
    that must be set before this function is called.

    Raises:
        ValueError: If a response does not contain the expected JSON payload.
        KeyError: If the payload lacks an expected field.
    """
    first_page = _fetch_seller_page(1)
    product_infos = [_seller_product_record(p) for p in first_page['ProductList']]
    page_nums = first_page['Total'] // 40 + 1

    for i in range(2, page_nums + 1):
        page = _fetch_seller_page(i)
        product_infos.extend(_seller_product_record(p) for p in page['ProductList'])

    with open(work_path + 'yangmatou' + '.txt', 'wb') as f:
        pickle.dump(product_infos, f)
    print(': success')
def _legacy_category(title):
    """Return the category label for ``title``, or None if the row is skipped.

    Keywords are tested in priority order; a title that matches none of them
    but mentions a hat ('帽') is skipped, anything else counts as clothing.
    """
    keyword_labels = (
        ('钱包', '钱包'),
        ('披肩', '围巾'),
        ('围巾', '围巾'),
        ('包', '包'),
        ('鞋', '鞋'),
        ('带', '腰带'),
    )
    for keyword, label in keyword_labels:
        if keyword in title:
            return label
    if '帽' in title:
        return None
    return '衣服'


def write_excel_old(download_type, book):
    """Write the pickled products of ``download_type`` to a new worksheet.

    Reads ``work_path + download_type + '.txt'`` (a pickle produced by this
    script; never point it at untrusted files), adds a worksheet named
    ``download_type`` to ``book`` and writes one row per product: title,
    image, price, link and category. Hat-only products are left out.
    The next free row index is recorded in the module-level
    ``download_type_number`` dict.

    Args:
        download_type: Name of the pickle file stem and of the worksheet.
        book: Open ``xlsxwriter.Workbook`` to add the worksheet to.

    Returns:
        The same ``book`` object.
    """
    with open(work_path + download_type + '.txt', 'rb') as f:
        it_goods_list = pickle.load(f)

    sheet = book.add_worksheet(download_type)
    # Column widths.
    sheet.set_column("A:A", 11)     # site name header
    sheet.set_column("B:B", 102)    # product title
    sheet.set_column("C:C", 20.5)   # product image
    sheet.set_column("D:D", 18.25)  # price
    sheet.set_column("E:E", 106)    # product link

    # Shared format for every cell written to this sheet.
    cell_format = book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',     # horizontal alignment
        'valign': 'vcenter',   # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, 'yangmatou', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '优惠价', cell_format)
    sheet.write(0, 4, '商品链接', cell_format)
    sheet.write(0, 5, '类型', cell_format)

    img_format = {'x_offset': 4,   # horizontal shift
                  'y_offset': 0,
                  'x_scale': 0.2,  # scale factors
                  'y_scale': 0.19}

    row_number = 1
    for product in it_goods_list:
        # Decide on skipping BEFORE touching the sheet: skipping after the
        # writes used to leave a stray image and a half-written row behind.
        category = _legacy_category(product['product_title'])
        if category is None:
            continue
        sheet.set_row(row_number, 102)
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['product_img_url_path'], img_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, product['product_url'], cell_format)
        sheet.write(row_number, 5, category, cell_format)
        row_number += 1

    # Next free row index == number of product rows + 1 (for the header).
    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book
def _category_for_title(title):
    """Return the category label for ``title``, or None if the row is skipped.

    Keywords are tested in priority order; a title that matches none of them
    but mentions a hat ('帽') is skipped, anything else counts as clothing.
    """
    keyword_labels = (
        ('钱包', '钱包'),
        ('披肩', '围巾'),
        ('围巾', '围巾'),
        ('包', '包'),
        ('鞋', '鞋'),
        ('带', '腰带'),
    )
    for keyword, label in keyword_labels:
        if keyword in title:
            return label
    if '帽' in title:
        return None
    return '衣服'


def write_excel(download_type, book):
    """Write the de-duplicated products of ``download_type`` to a worksheet.

    Reads ``work_path + download_type + '.txt'`` (a pickle produced by this
    script; never point it at untrusted files), drops products whose
    ``product_url`` was already seen, and writes one row per remaining
    product: title, image, price, link, category, VIP price, max price and
    seller. Perfume ('香水') titles and hat-only titles are left out. Each
    product dict must carry ``product_img_url_path``, which
    ``DownImgClass.multi_download_img`` adds. The next free row index is
    recorded in the module-level ``download_type_number`` dict.

    Args:
        download_type: Name of the pickle file stem and of the worksheet.
        book: Open ``xlsxwriter.Workbook`` to add the worksheet to.

    Returns:
        The same ``book`` object.
    """
    with open(work_path + download_type + '.txt', 'rb') as f:
        it_goods_lists = pickle.load(f)

    # Keep the first occurrence of every product URL.
    seen_urls = set()
    it_goods_list = []
    for product in it_goods_lists:
        if product['product_url'] in seen_urls:
            continue
        seen_urls.add(product['product_url'])
        it_goods_list.append(product)
    print(len(it_goods_lists))
    print(len(it_goods_list))

    sheet = book.add_worksheet(download_type)
    # Column widths.
    sheet.set_column("A:A", 11)     # site name header
    sheet.set_column("B:B", 102)    # product title
    sheet.set_column("C:C", 25.5)   # product image
    sheet.set_column("D:D", 18.25)  # price
    sheet.set_column("E:E", 106)    # product link
    sheet.set_column("F:F", 18.25)  # category
    sheet.set_column("G:G", 18)     # VIP price
    sheet.set_column("H:H", 18)     # max price
    sheet.set_column("I:I", 36)     # seller

    # Shared format for every cell written to this sheet.
    cell_format = book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',     # horizontal alignment
        'valign': 'vcenter',   # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, 'yangmatou', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '优惠价', cell_format)
    sheet.write(0, 4, '商品链接', cell_format)
    sheet.write(0, 5, '类型', cell_format)
    sheet.write(0, 6, 'vip售价', cell_format)
    sheet.write(0, 7, '最高价', cell_format)
    sheet.write(0, 8, '商家', cell_format)

    img_format = {'x_offset': 4,   # horizontal shift
                  'y_offset': 0,
                  'x_scale': 0.2,  # scale factors
                  'y_scale': 0.19}

    row_number = 1
    for product in it_goods_list:
        title = product['product_title']
        if '香水' in title:
            continue
        # Decide on skipping BEFORE touching the sheet: skipping after the
        # writes used to leave a stray image and a half-written row behind.
        category = _category_for_title(title)
        if category is None:
            continue
        sheet.set_row(row_number, 145)
        sheet.write(row_number, 1, title, cell_format)
        sheet.insert_image(row_number, 2, product['product_img_url_path'], img_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, product['product_url'], cell_format)
        sheet.write(row_number, 5, category, cell_format)
        # A VIP price of 0 is shown as '无' (none).
        if product['MinVipPrice'] == 0:
            sheet.write(row_number, 6, '无', cell_format)
        else:
            sheet.write(row_number, 6, product['MinVipPrice'], cell_format)
        sheet.write(row_number, 7, product['MaxPrice'], cell_format)
        sheet.write(row_number, 8, product['SellerName'], cell_format)
        row_number += 1

    # Next free row index == number of product rows + 1 (for the header).
    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book
def write_home_page(book):
    """Add a 'Home' summary worksheet listing the item count per sheet.

    One row is written for every entry of the module-level
    ``download_type_number`` dict, followed by a grand-total row. Each stored
    value is a sheet's next free row index, so the item count is that value
    minus one (the header row).

    Args:
        book: Open ``xlsxwriter.Workbook`` to add the worksheet to.

    Returns:
        The same ``book`` object.
    """
    sheet = book.add_worksheet('Home')
    # Column widths.
    sheet.set_column("A:A", 29)   # sheet / category name
    sheet.set_column("B:B", 8.5)  # item count

    # Shared format for every cell written to this sheet.
    cell_format = book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',     # horizontal alignment
        'valign': 'vcenter',   # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, '种类', cell_format)
    sheet.write(0, 1, '个数', cell_format)

    all_number = 0
    for row_number, (name, next_row) in enumerate(download_type_number.items(), start=1):
        count = next_row - 1
        sheet.set_row(row_number, 22)
        sheet.write(row_number, 0, name, cell_format)
        sheet.write(row_number, 1, count, cell_format)
        all_number += count

    # The total goes on the row right after the last entry.
    total_row = len(download_type_number) + 1
    sheet.write(total_row, 0, '总计', cell_format)
    sheet.write(total_row, 1, all_number, cell_format)
    print('HomePage' + ': 写入EXCEL成功')
    return book
def write_excel_to_path(excel_name):
    """Build the workbook at ``excel_name``: product sheet plus 'Home' summary.

    Args:
        excel_name: Path of the .xlsx file to create.
    """
    workbook = xlsxwriter.Workbook(excel_name)
    # The product sheet must be written first: the summary sheet reads the
    # row counts it records.
    workbook = write_excel('yangmatou', workbook)
    write_home_page(workbook)
    workbook.close()
    print(excel_name + ': 写入EXCEL成功')
# Maps a worksheet name to the next free row index recorded by the sheet
# writers; write_home_page() reads it to report item counts.
download_type_number = {}
import requests
class DownImgClass:
    """Download the product images referenced by the pickled product list."""

    def __init__(self):
        # One {'img_url', 'img_path'} dict per image to download.
        self.imgurl_all_list = []
        # Products that have an image URL, each tagged with its local path.
        self.product_all_list = []
        self.it_header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}

    def downUrl(self, url, path_name):
        """Download ``url`` and save the response body to ``path_name``.

        Raises:
            requests.RequestException: On network failure, timeout or an
                HTTP error status (nothing is written in that case).
        """
        # requests never times out unless told to, so an explicit limit is
        # needed; 30 seconds mirrors the module-wide socket default.
        r = requests.get(url, headers=self.it_header, timeout=30)
        # Do not save an HTTP error page under an image file name.
        r.raise_for_status()
        with open(path_name, "wb") as code:
            code.write(r.content)
        print(path_name + ": success")
        return None

    @staticmethod
    def _plan_downloads(products, img_save_path):
        """Assign a local image path to every product and list the downloads.

        Every product gets ``product_img_url_path`` set to
        ``img_save_path + <index> + '.jpg'``, where the index is the
        product's position in ``products``. Products with an empty
        ``product_img_url`` are then left out of both returned lists.

        Returns:
            ``(jobs, kept)``: the download jobs (``img_url`` / ``img_path``
            dicts) and the products that have an image.
        """
        jobs = []
        kept = []
        for index, product in enumerate(products):
            img_path = img_save_path + str(index) + '.jpg'
            product['product_img_url_path'] = img_path
            img_url = product['product_img_url']
            if img_url == '':
                continue
            jobs.append({'img_url': img_url, 'img_path': img_path})
            kept.append(product)
        return jobs, kept

    def multi_download_img(self):
        """Download all product images in parallel worker processes.

        Loads the product list from ``work_path + 'yangmatou.txt'`` (a pickle
        produced by this script; never point it at untrusted files), writes
        back only the products that have an image (now carrying
        ``product_img_url_path``), then downloads the images into
        ``./yangmatou/img/``. ``work_path`` is a module-level global that
        must be set before this method is called.
        """
        with open(work_path + 'yangmatou' + '.txt', 'rb') as f:
            it_goods_list = pickle.load(f)
        print('read')

        img_save_path = './yangmatou/img/'
        mkdir(img_save_path)
        self.imgurl_all_list, self.product_all_list = self._plan_downloads(
            it_goods_list, img_save_path)

        with open(work_path + 'yangmatou' + '.txt', 'wb') as f:
            pickle.dump(self.product_all_list, f)
        print(len(self.imgurl_all_list))
        print('down2')

        pool = Pool()
        try:
            pool.map(self.second_multi_download_img, range(len(self.imgurl_all_list)))
        finally:
            # Always release the worker processes, even if map() fails.
            pool.close()
            pool.join()
        print('down1')

    def second_multi_download_img(self, number):
        """Download the image at index ``number`` of ``imgurl_all_list``.

        Failures are printed and swallowed so that one bad image does not
        abort the whole batch.
        """
        print(self.imgurl_all_list[number])
        img_url = self.imgurl_all_list[number]['img_url']
        url_path = self.imgurl_all_list[number]['img_path']
        print('down')
        try:
            self.downUrl(img_url, url_path)
        except Exception as e:
            print(e)
if __name__ == '__main__':
    download_style = 'yangmatou'
    # Output is grouped per day: ./gucci/<MM_DD>/ holds the workbook and
    # ./gucci/<MM_DD>/yangmatou/ holds the pickled product list.
    path_time = time.strftime("%m_%d")
    save_path = './gucci/' + path_time + '/'
    work_path = save_path + download_style + '/'
    mkdir(work_path)

    # Stage 1 (currently disabled): scrape the product list.
    # get_product()

    # Stage 2 (currently disabled): download the product images.
    # down_img = DownImgClass()
    # down_img.multi_download_img()

    # Stage 3: write the workbook from the pickled product list.
    write_excel_to_path(save_path + 'yangmatou_excel_' + path_time + '.xlsx')
</code></pre>