11、secoo_all
<pre><code>from bs4 import BeautifulSoup
import time
import random
import xlsxwriter
import re
import os
import pickle
from selenium import webdriver
def mkdir(path):
    """Create the directory ``path`` (with any missing parents) if needed.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if this call created the directory, False if something already
        exists at that path.

    Raises:
        OSError: If the directory does not exist and cannot be created.
    """
    path = path.strip()
    path = path.rstrip("\\")
    try:
        # Attempt the creation directly rather than testing for existence
        # first, so nothing can create the path between a check and the call.
        os.makedirs(path)
    except OSError:
        # Only swallow the error when the path really is there already;
        # any other failure (permissions, empty path, ...) is re-raised.
        if not os.path.exists(path):
            raise
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True
def get_product_infos(style):
    """Scrape every listing page of one category and pickle the results.

    The listing URL for page ``n`` is built as
    ``all_inf_url[style + '_url_i'] + str(n) + all_inf_url[style + '_url_o']``.
    Pages are loaded through the module-level ``browser`` driver. The
    collected product dicts are written to ``./secoo/<style>.txt`` with
    pickle.

    Args:
        style: Category name; ``style + '_url_i'`` and ``style + '_url_o'``
            must both be keys of ``all_inf_url``.

    Returns:
        None. The result is the pickle file on disk.

    Raises:
        KeyError: If ``all_inf_url`` has no URL parts for ``style``.
    """
    url_i, url_o = all_inf_url[style + '_url_i'], all_inf_url[style + '_url_o']

    # The first page also carries the pager that tells us how many pages exist.
    first_page = _load_listing_page(url_i + str(1) + url_o)
    page_count = _parse_page_count(first_page)
    product_infos = _extract_products(first_page)

    # Walk the remaining pages.
    for page_no in range(2, page_count + 1):
        listing = _load_listing_page(url_i + str(page_no) + url_o)
        product_infos.extend(_extract_products(listing))

    mkdir('./secoo/')
    # Use a context manager so the dump file is flushed and closed promptly.
    with open('./secoo/' + style + '.txt', 'wb') as dump_file:
        pickle.dump(product_infos, dump_file)
    print(style + ': success')


def _load_listing_page(url):
    """Open ``url`` in the shared browser and return the parsed page."""
    browser.get(url)
    # Random 2-5 second pause before reading the page source
    # (presumably to let the page finish rendering and to throttle requests).
    time.sleep(random.randint(2, 5))
    return BeautifulSoup(browser.page_source, 'lxml')


def _parse_page_count(soup):
    """Return the page total shown after the '/' in the pager, or 1 if absent."""
    pagers = soup.find_all(class_="product_control_page")
    if not pagers:
        # No pager element on the page: treat the listing as a single page.
        return 1
    pager_text = pagers[0].text.replace('\n', '')
    _, slash, total = pager_text.partition('/')
    if not slash:
        return 1
    return int(total.replace(' ', ''))


def _extract_products(soup):
    """Build one product dict per listing entry found in ``soup``."""
    titles = soup.find_all(class_="dl_name")
    show_tips = soup.find_all(class_="show_tips")
    prices = soup.find_all(class_="dl_price clearfix")
    # zip() pairs the three parallel result lists and stops at the shortest,
    # so a page with a missing element does not abort the whole scrape.
    return [
        {
            'product_img_url': tip.dt.img['data-original'],
            'product_url': title.a['href'],
            'product_title': title.a['title'],
            # Skip the first character of the price text
            # (presumably the currency symbol - confirm against the site).
            'product_price': price.text[1:],
        }
        for title, tip, price in zip(titles, show_tips, prices)
    ]
def secoo_write_excel(download_type, book):
    """Write one category's scraped products into a new worksheet of ``book``.

    Loads the pickled product list from ``./secoo/<download_type>.txt``, adds
    a worksheet named ``download_type``, writes a header row followed by one
    row per product, and stores ``number of products + 1`` (the next free row
    index) in ``download_type_number[download_type]``.

    Args:
        download_type: Category name; used for both the pickle file name and
            the worksheet name.
        book: Workbook object providing ``add_worksheet`` and ``add_format``.

    Returns:
        The same ``book`` object, for chaining.
    """
    # NOTE: pickle must only be loaded from trusted files. This path is the
    # one get_product_infos writes; do not point it at external data.
    with open('./secoo/' + download_type + '.txt', 'rb') as dump_file:
        it_goods_list = pickle.load(dump_file)

    sheet = book.add_worksheet(download_type)

    # Column widths, labelled by the header written into each column below.
    sheet.set_column("A:A", 5)       # site label column
    sheet.set_column("B:B", 112.88)  # product name
    sheet.set_column("C:C", 10.5)    # front image
    sheet.set_column("D:D", 12.38)   # retail price
    sheet.set_column("E:E", 82)      # product link

    # One shared cell format for every cell written on this sheet.
    cell_style = {
        'font_size': 11,
        'bold': False,
        'align': 'center',      # horizontal alignment
        'valign': 'vcenter',    # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    }
    cell_format = book.add_format(cell_style)

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, '寺库', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '零售价', cell_format)
    sheet.write(0, 4, '商品链接', cell_format)

    # One row per product. Columns A and C are left empty in data rows:
    # nothing is written to A, and image insertion into C is not performed.
    for row_number, product in enumerate(it_goods_list, start=1):
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, product['product_url'], cell_format)

    # Record the next free row index (product count + 1); write_home_page
    # subtracts 1 to recover the product count.
    download_type_number[download_type] = len(it_goods_list) + 1
    print(download_type + ': 写入EXCEL成功')
    return book
def write_home_page(book):
    """Add a 'Home' summary worksheet listing the product count per category.

    Reads the module-level ``download_type_number`` mapping, whose values are
    ``product count + 1`` once a category has been written (0 otherwise), and
    writes one row per category followed by a grand-total row.

    Args:
        book: Workbook object providing ``add_worksheet`` and ``add_format``.

    Returns:
        The same ``book`` object, for chaining.
    """
    sheet = book.add_worksheet('Home')

    # Column widths.
    sheet.set_column("A:A", 29)   # category name
    sheet.set_column("B:B", 8.5)  # product count

    # One shared cell format for every cell written on this sheet.
    cell_style = {
        'font_size': 11,
        'bold': False,
        'align': 'center',      # horizontal alignment
        'valign': 'vcenter',    # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    }
    cell_format = book.add_format(cell_style)

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, '种类', cell_format)
    sheet.write(0, 1, '个数', cell_format)

    all_number = 0
    entries = enumerate(download_type_number.items(), start=1)
    for row_number, (category, next_row) in entries:
        # Stored values are "count + 1"; a category that was never written
        # still holds 0, so clamp to avoid reporting -1 products.
        count = max(next_row - 1, 0)
        all_number += count
        sheet.set_row(row_number, 22)
        sheet.write(row_number, 0, category, cell_format)
        sheet.write(row_number, 1, count, cell_format)

    # Grand total goes on the row right after the last category.
    total_row = len(download_type_number) + 1
    sheet.write(total_row, 0, '总计', cell_format)
    sheet.write(total_row, 1, all_number, cell_format)
    print('HomePage' + ': 写入EXCEL成功')
    return book
def secoo_write_excel_to_path(secoo_all, excel_name):
    """Build the workbook ``excel_name`` from the given categories.

    Creates the workbook, lets ``secoo_write_excel`` add one worksheet per
    entry of ``secoo_all`` (printing each category name first), appends the
    summary sheet via ``write_home_page``, and closes the workbook.

    Args:
        secoo_all: Sequence of category names to export, in sheet order.
        excel_name: Output path passed to ``xlsxwriter.Workbook``.

    Returns:
        None.
    """
    workbook = xlsxwriter.Workbook(excel_name)
    for category in secoo_all:
        print(category)
        # secoo_write_excel hands the workbook back; keep using what it returns.
        workbook = secoo_write_excel(category, workbook)
    write_home_page(workbook)
    workbook.close()
    print(excel_name + ': 写入EXCEL成功')
# Per-category bookkeeping shared by the Excel writers: secoo_write_excel
# stores the next free row index for the category it wrote, and
# write_home_page reads the values back for the summary sheet.
download_type_number = dict.fromkeys(
    ['belts', 'scarves', 'men_bags', 'women_bags', 'bags'],
    0,
)

# Category names, in the order they are processed and exported.
secoo_all = [
    'belts', 'scarves', 'men_bags', 'women_bags', 'bags',
    'men_clothes', 'women_clothes',
    'men_shoes', 'women_shoes',
]

# Listing URL parts per category. A page URL is built as
# <name>_url_i + str(page_number) + <name>_url_o.

# Accessories
belts_url_i = 'http://list.secoo.com/accessories/857-63-0-5-0-1-0-0-'
belts_url_o = '-10-0-0-100-0.shtml#J_FilterPos'
scarves_url_i = 'http://list.secoo.com/accessories/857-1790-0-5-0-1-0-0-'
scarves_url_o = '-10-0-0-100-0.shtml#J_FilterPos'

# Clothes
men_clothes_url_i = 'http://list.secoo.com/undefined/1660-0-0-5-0-1-0-0-'
men_clothes_url_o = '-10-0-0-100-0.shtml#J_FilterPos'
women_clothes_url_i = 'http://list.secoo.com/undefined/1690-0-0-5-0-1-0-0-'
women_clothes_url_o = '-10-0-0-100-0.shtml#J_FilterPos'

# Shoes
men_shoes_url_i = 'http://list.secoo.com/undefined/1555-0-0-5-0-1-0-0-'
men_shoes_url_o = '-10-0-0-100-0.shtml#J_FilterPos'
women_shoes_url_i = 'http://list.secoo.com/undefined/1554-0-0-5-0-1-0-0-'
women_shoes_url_o = '-10-0-0-100-0.shtml#J_FilterPos'

# Bags: the three variants share one prefix and differ only in the suffix.
men_bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-'
men_bags_url_o = '-10-0-877_0-100-0.shtml#J_FilterPos'
women_bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-'
women_bags_url_o = '-10-0-877_1-100-0.shtml#J_FilterPos'
bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-'
bags_url_o = '-10-0-877_2-100-0.shtml#J_FilterPos'

# Lookup table used by get_product_infos: '<category>_url_i' / '<category>_url_o'.
all_inf_url = dict(
    belts_url_i=belts_url_i,
    belts_url_o=belts_url_o,
    scarves_url_i=scarves_url_i,
    scarves_url_o=scarves_url_o,
    men_clothes_url_i=men_clothes_url_i,
    men_clothes_url_o=men_clothes_url_o,
    women_clothes_url_i=women_clothes_url_i,
    women_clothes_url_o=women_clothes_url_o,
    men_shoes_url_i=men_shoes_url_i,
    men_shoes_url_o=men_shoes_url_o,
    women_shoes_url_i=women_shoes_url_i,
    women_shoes_url_o=women_shoes_url_o,
    men_bags_url_i=men_bags_url_i,
    men_bags_url_o=men_bags_url_o,
    women_bags_url_i=women_bags_url_i,
    women_bags_url_o=women_bags_url_o,
    bags_url_i=bags_url_i,
    bags_url_o=bags_url_o,
)
browser = webdriver.Chrome()
for i in range(4,len(secoo_all)):
get_product_infos(secoo_all[i])
secoo_write_excel_to_path(secoo_all, 'secoo_excel-23.xlsx')</code></pre>