python


11、secoo_all

<pre><code>from bs4 import BeautifulSoup import time import random import xlsxwriter import re import os import pickle from selenium import webdriver def mkdir(path): path = path.strip() path = path.rstrip("\\") isExists = os.path.exists(path) # 判断结果 if not isExists: os.makedirs(path) print(path + ' 创建成功') return True else: # 如果目录存在则不创建,并提示目录已存在 print(path + ' 目录已存在') return False def get_product_infos(style): product_infos = [] url_i, url_o = all_inf_url[style+'_url_i'], all_inf_url[style+'_url_o'] # 打开第一页,提取页码 url = url_i + str(1) + url_o browser.get(url) time.sleep(random.randint(2, 5)) page_html = browser.page_source Soup = BeautifulSoup(page_html, 'lxml') product_page_nums = Soup.find_all(class_="product_control_page") str1 = product_page_nums[0].text.replace('\n', '') page_num = str1[str1.index('/') + 1:].replace(' ', '') product_titles = Soup.find_all(class_="dl_name") product_show_tips = Soup.find_all(class_="show_tips") product_prices = Soup.find_all(class_="dl_price clearfix") for i in range(0, len(product_prices)): product_img_url = product_show_tips[i].dt.img['data-original'] product_url = product_titles[i].a['href'] product_title = product_titles[i].a['title'] product_price = product_prices[i].text[1:] product_info = \ { 'product_img_url': product_img_url, 'product_url': product_url, 'product_title': product_title, 'product_price': product_price } product_infos.append(product_info) # 遍历全部页码 for j in range(2, int(page_num)+1): #url_i = 'http://search.secoo.com/search?keyword=Gucci&amp;firstcategoryid=30&amp;secondcategoryid=0&amp;thirdcategoryid=0&amp;brandId=0&amp;level=0&amp;orderType=1&amp;filterType=0&amp;source=&amp;pageNo=' #url_o = '&amp;st=10&amp;price=0&amp;prop=0&amp;warehouse=100&amp;actscr=0&amp;expKey=#J_Filter' url = url_i + str(j) + url_o browser.get(url) time.sleep(random.randint(2, 5)) page_html = browser.page_source Soup = BeautifulSoup(page_html, 'lxml') product_titles = Soup.find_all(class_="dl_name") product_show_tips = Soup.find_all(class_="show_tips") product_prices = Soup.find_all(class_="dl_price clearfix") for i in range(0, len(product_prices)): product_img_url = product_show_tips[i].dt.img['data-original'] product_url = product_titles[i].a['href'] product_title = product_titles[i].a['title'] product_price = product_prices[i].text[1:] product_info = \ { 'product_img_url': product_img_url, 'product_url': product_url, 'product_title': product_title, 'product_price': product_price } product_infos.append(product_info) mkdir('./secoo/') pickle.dump(product_infos, open('./secoo/' + style + '.txt', 'wb')) print(style + ': success') def secoo_write_excel(download_type, book): # ''' # 读取变量 it_goods_list = pickle.load(open('./secoo/' + download_type + '.txt', 'rb')) sheet = book.add_worksheet(download_type) # 设置sheet表单元格列宽 sheet.set_column("A:A", 5) # 寺库 sheet.set_column("B:B", 112.88) # 商品名称 sheet.set_column("C:C", 10.5) # 正面图 sheet.set_column("D:D", 12.38) # 货号 sheet.set_column("E:E", 82) # 零售价 # sheet.set_column("F:F", 82) # 商品链接 # 设定整个sheet表的单元格的格式 property = { 'font_size': 11, # 字体大小 'bold': False, # 是否加粗 'align': 'center', # 水平对齐方式 left 'valign': 'vcenter', # 垂直对齐方式 'font_name': u'微软雅黑', 'text_wrap': False, # 是否自动换行 } cell_format = book.add_format(property) # 设置sheet表单元格行高 sheet.set_row(0, 22) # 设置第一行的高度为22 # 在向单元格中写入内容时,加上单元格样式 # 插入第一行 sheet.write(0, 0, '寺库', cell_format) sheet.write(0, 1, '商品名称', cell_format) sheet.write(0, 2, '正面图', cell_format) sheet.write(0, 3, '零售价', cell_format) sheet.write(0, 4, '商品链接', cell_format) img_format = {'x_offset': 4, # 左右移动 'y_offset': 0, 'x_scale': 0.2, # 缩放比例 'y_scale': 0.19} # 插入爬取it_goods_list信息 row_number = 1 for product in it_goods_list: sheet.set_row(row_number, 52) # 设置第row_number行的高度为52 sheet.write(row_number, 1, product['product_title'], cell_format) # sheet.insert_image(row_number, 2, product['url_path'], img_format) # sheet.write(row_number, 3, product['productCode'], cell_format) # product['productCode'] sheet.write(row_number, 3, str(product['product_price']), cell_format) sheet.write(row_number, 4, product['product_url'], cell_format) row_number = row_number + 1 download_type_number[download_type] = row_number print(download_type + ': 写入EXCEL成功') return book def write_home_page(book): keys = [] for k in download_type_number.keys(): keys.append(k) sheet = book.add_worksheet('Home') # 设置sheet表单元格列宽 sheet.set_column("A:A", 29) # 种类 sheet.set_column("B:B", 8.5) # 商品名称 # 设定整个sheet表的单元格的格式 property = { 'font_size': 11, # 字体大小 'bold': False, # 是否加粗 'align': 'center', # 水平对齐方式 left 'valign': 'vcenter', # 垂直对齐方式 'font_name': u'微软雅黑', 'text_wrap': False, # 是否自动换行 } cell_format = book.add_format(property) sheet.set_row(0, 22) # 设置第一行的高度为22 # 插入第一行 sheet.write(0, 0, '种类', cell_format) sheet.write(0, 1, '个数', cell_format) row_number = 1 all_number = 0 for i in range(0,len(download_type_number)): sheet.set_row(row_number, 22) # 设置第row_number行的高度为22 sheet.write(row_number, 0, keys[i], cell_format) all_number = all_number + download_type_number[keys[i]] - 1 sheet.write(row_number, 1, download_type_number[keys[i]] - 1, cell_format) # product['productCode'] row_number = row_number + 1 sheet.write(row_number, 0, '总计', cell_format) sheet.write(row_number, 1, all_number, cell_format) # book.close() print('HomePage' + ': 写入EXCEL成功') return book def secoo_write_excel_to_path(secoo_all, excel_name): book = xlsxwriter.Workbook(excel_name) for i in range(0, len(secoo_all)): print(secoo_all[i]) book = secoo_write_excel(secoo_all[i], book) write_home_page(book) book.close() print(excel_name + ': 写入EXCEL成功') return None download_type_number =\ { 'belts':0, 'scarves':0, 'men_bags':0, 'women_bags':0, 'bags':0 } secoo_all = ['belts', 'scarves','men_bags','women_bags','bags', 'men_clothes', 'women_clothes', 'men_shoes','women_shoes'] belts_url_i = 'http://list.secoo.com/accessories/857-63-0-5-0-1-0-0-' belts_url_o = '-10-0-0-100-0.shtml#J_FilterPos' scarves_url_i = 'http://list.secoo.com/accessories/857-1790-0-5-0-1-0-0-' scarves_url_o = '-10-0-0-100-0.shtml#J_FilterPos' men_clothes_url_i = 'http://list.secoo.com/undefined/1660-0-0-5-0-1-0-0-' men_clothes_url_o = '-10-0-0-100-0.shtml#J_FilterPos' women_clothes_url_i = 'http://list.secoo.com/undefined/1690-0-0-5-0-1-0-0-' women_clothes_url_o = '-10-0-0-100-0.shtml#J_FilterPos' men_shoes_url_i = 'http://list.secoo.com/undefined/1555-0-0-5-0-1-0-0-' men_shoes_url_o = '-10-0-0-100-0.shtml#J_FilterPos' women_shoes_url_i = 'http://list.secoo.com/undefined/1554-0-0-5-0-1-0-0-' women_shoes_url_o = '-10-0-0-100-0.shtml#J_FilterPos' men_bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-' men_bags_url_o = '-10-0-877_0-100-0.shtml#J_FilterPos' women_bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-' women_bags_url_o = '-10-0-877_1-100-0.shtml#J_FilterPos' bags_url_i = 'http://list.secoo.com/bags/30-0-0-5-0-1-0-0-' bags_url_o = '-10-0-877_2-100-0.shtml#J_FilterPos' all_inf_url = { 'belts_url_i': belts_url_i, 'belts_url_o': belts_url_o, 'scarves_url_i': scarves_url_i, 'scarves_url_o': scarves_url_o, 'men_clothes_url_i': men_clothes_url_i, 'men_clothes_url_o': men_clothes_url_o, 'women_clothes_url_i': women_clothes_url_i, 'women_clothes_url_o': women_clothes_url_o, 'men_shoes_url_i': men_shoes_url_i, 'men_shoes_url_o': men_shoes_url_o, 'women_shoes_url_i': women_shoes_url_i, 'women_shoes_url_o': women_shoes_url_o, 'men_bags_url_i': men_bags_url_i, 'men_bags_url_o': men_bags_url_o, 'women_bags_url_i': women_bags_url_i, 'women_bags_url_o': women_bags_url_o, 'bags_url_i': bags_url_i, 'bags_url_o': bags_url_o } browser = webdriver.Chrome() for i in range(4,len(secoo_all)): get_product_infos(secoo_all[i]) secoo_write_excel_to_path(secoo_all, 'secoo_excel-23.xlsx')</code></pre>

页面列表

ITEM_HTML