python


2、yangmatou

"""Scrape one ymatou.com seller's product list and export it to Excel.

Workflow (each step is driven from the ``__main__`` section at the bottom):

1. ``get_product()`` downloads every result page of the seller API and
   pickles the collected product dicts into ``work_path``.
2. ``DownImgClass().multi_download_img()`` downloads the product images and
   re-pickles the products with their local image path attached.
3. ``write_excel_to_path()`` renders the pickled products into a workbook
   with one product sheet and a ``Home`` summary sheet.

The functions read the module-level ``work_path`` that the ``__main__``
section assigns; set it before calling them from other code.
"""
from multiprocessing import Pool
import json
import os
import pickle
import random
import re
import socket
import ssl
import time
import urllib.request

import requests
import xlsxwriter

# Default timeout (30 s) for sockets that do not configure their own.
socket.setdefaulttimeout(30)
# NOTE(security): this disables HTTPS certificate verification for every
# urllib request made by this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Number of products the API is asked for per page (pageStart advances by it).
PAGE_SIZE = 40
# Timeout in seconds for image downloads made through ``requests``.
REQUEST_TIMEOUT = 30
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'
)

# Title keyword -> value written to the category column, checked in order.
# A label of None means "leave this product out of the sheet".
_CATEGORY_RULES = (
    ('钱包', '钱包'),
    ('披肩', '围巾'),
    ('围巾', '围巾'),
    ('包', '包'),
    ('鞋', '鞋'),
    ('带', '腰带'),
    ('帽', None),
)
_DEFAULT_CATEGORY = '衣服'


def mkdir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Returns:
        True when the directory was created, False when it already existed.
    """
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        print(path + ' 目录已存在')
        return False
    os.makedirs(path)
    print(path + ' 创建成功')
    return True


def open_url(url):
    """Fetch ``url`` with a browser User-Agent and return the body as text.

    The body is decoded as UTF-8. Errors from ``urllib`` propagate.
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")


# The query separators must be plain '&'; an HTML-escaped '&amp;' would make
# the server see parameters named 'amp;sellerId', 'amp;pageIndex', ...
url_i = 'https://www.ymatou.com/sellerhome/api/SellerProduct?callback=' \
        'jQuery33107037562205245479_1552098836879&sellerId=21948758&pageIndex='
url_o = '&brandId=10134&categoryId=0&priceSort=&hasCoupon=1&pageStart='
url_e = '&_=1552098836885'
# Example of a complete request URL (page 3); not used by the code below.
url2 = 'https://www.ymatou.com/sellerhome/api/SellerProduct?callback=' \
       'jQuery33107037562205245479_1552098836879&sellerId=21948758&pageIndex=' \
       '3&brandId=10134&categoryId=0&priceSort=&hasCoupon=1&pageStart=80&_=1552098836886'


def _parse_jsonp(text):
    """Return the JSON object embedded in a ``callback({...})`` response.

    Raises:
        ValueError: if ``text`` contains no decodable JSON object.
    """
    start = text.find('{')
    if start < 0:
        raise ValueError('no JSON object found in response')
    # raw_decode stops at the end of the object and ignores the trailing ')'.
    payload, _end = json.JSONDecoder().raw_decode(text, start)
    return payload


def _page_count(total, page_size=PAGE_SIZE):
    """Return how many pages hold ``total`` items (never less than 1)."""
    # Ceiling division: an exact multiple of page_size needs no extra page.
    return max(1, -(-total // page_size))


def _to_product_info(product):
    """Map one API product record to the dict layout stored in the pickle."""
    return {
        'product_img_url': product['PicUrl'],
        'TaxFarming': product['TaxFarming'],
        'FreeShipping': product['FreeShipping'],
        'product_url': 'https://www.ymatou.com/product/' + product['ProductId'] + '.html',
        'product_title': product['Title'],
        'product_price': product['Price'],
    }


def _fetch_page(page_index):
    """Download result page ``page_index`` (1-based); return its ProductInfo."""
    page_start = (page_index - 1) * PAGE_SIZE
    url = url_i + str(page_index) + url_o + str(page_start) + url_e
    return _parse_jsonp(open_url(url))['Data']['ProductInfo']


def _classify_title(title):
    """Return the category label for ``title``, or None to skip the product."""
    for keyword, label in _CATEGORY_RULES:
        if keyword in title:
            return label
    return _DEFAULT_CATEGORY


def get_product():
    """Collect every product of the seller and pickle the list.

    The first page reports the total number of products, which determines
    how many further pages are requested. The result is written to
    ``work_path + 'yangmatou.txt'`` as a pickled list of dicts.
    """
    first_page = _fetch_page(1)
    product_infos = [_to_product_info(p) for p in first_page['ProductList'] or []]
    for page_index in range(2, _page_count(first_page['Total']) + 1):
        page = _fetch_page(page_index)
        product_infos.extend(_to_product_info(p) for p in page['ProductList'] or [])

    with open(work_path + 'yangmatou' + '.txt', 'wb') as fh:
        pickle.dump(product_infos, fh)
    print(': success')


def _default_cell_format(book):
    """Create the centered 11 pt cell format shared by all sheets."""
    return book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',
        'valign': 'vcenter',
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })


def write_excel(download_type, book):
    """Add a sheet named ``download_type`` listing the pickled products.

    Products are read from ``work_path + download_type + '.txt'``. Products
    whose title matches the skip rule (hats) are left out entirely.
    ``download_type_number[download_type]`` is set to the number of written
    products plus one (the header row), which ``write_home_page`` relies on.

    Returns:
        The same ``book`` object, for chaining.
    """
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this script.
    with open(work_path + download_type + '.txt', 'rb') as fh:
        it_goods_list = pickle.load(fh)

    sheet = book.add_worksheet(download_type)
    sheet.set_column("A:A", 11)     # site label
    sheet.set_column("B:B", 102)    # product title
    sheet.set_column("C:C", 20.5)   # product image
    sheet.set_column("D:D", 18.25)  # price
    sheet.set_column("E:E", 106)    # product link

    cell_format = _default_cell_format(book)

    # Header row.
    sheet.set_row(0, 22)
    sheet.write(0, 0, 'yangmatou', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '优惠价', cell_format)
    sheet.write(0, 4, '商品链接', cell_format)
    sheet.write(0, 5, '类型', cell_format)

    # Shrink each picture so it fits inside its (tall) row.
    img_format = {'x_offset': 4, 'y_offset': 0, 'x_scale': 0.2, 'y_scale': 0.19}

    row_number = 1
    for product in it_goods_list:
        # Decide the category first: a skipped product must not leave a
        # title or an image behind in a row that the next product reuses.
        category = _classify_title(product['product_title'])
        if category is None:
            continue

        sheet.set_row(row_number, 102)
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['product_img_url_path'], img_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, product['product_url'], cell_format)
        sheet.write(row_number, 5, category, cell_format)
        row_number += 1

    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book


def write_home_page(book):
    """Add a ``Home`` sheet with the product count per sheet and the total.

    Counts come from ``download_type_number``; each stored value includes the
    header row, hence the ``- 1``.

    Returns:
        The same ``book`` object, for chaining.
    """
    sheet = book.add_worksheet('Home')
    sheet.set_column("A:A", 29)   # sheet / category name
    sheet.set_column("B:B", 8.5)  # number of products

    cell_format = _default_cell_format(book)

    sheet.set_row(0, 22)
    sheet.write(0, 0, '种类', cell_format)
    sheet.write(0, 1, '个数', cell_format)

    row_number = 1
    all_number = 0
    for type_name, next_row in download_type_number.items():
        count = next_row - 1
        sheet.set_row(row_number, 22)
        sheet.write(row_number, 0, type_name, cell_format)
        sheet.write(row_number, 1, count, cell_format)
        all_number += count
        row_number += 1

    sheet.write(row_number, 0, '总计', cell_format)
    sheet.write(row_number, 1, all_number, cell_format)
    print('HomePage' + ': 写入EXCEL成功')
    return book


def write_excel_to_path(excel_name):
    """Build the workbook ``excel_name`` (product sheet + Home sheet)."""
    book = xlsxwriter.Workbook(excel_name)
    book = write_excel('yangmatou', book)
    write_home_page(book)
    book.close()
    print(excel_name + ': 写入EXCEL成功')
    return None


# Sheet name -> next free row index on that sheet (product count + 1).
download_type_number = {}


class DownImgClass:
    """Download the product images referenced by the pickled product list."""

    def __init__(self):
        # Dicts of {'img_url': ..., 'img_path': ...}, one per image to fetch.
        self.imgurl_all_list = []
        # Products that have an image URL, with their local image path added.
        self.product_all_list = []
        self.it_header = {'User-Agent': USER_AGENT}

    def downUrl(self, url, path_name):
        """Download ``url`` and save the response body to ``path_name``.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-success HTTP status (so error pages are never saved as
                image files).
        """
        # requests applies no timeout unless one is passed explicitly.
        r = requests.get(url, headers=self.it_header, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        with open(path_name, "wb") as code:
            code.write(r.content)
        print(path_name + ": success")
        return None

    def multi_download_img(self):
        """Download all product images in parallel worker processes.

        Reads the product pickle from ``work_path``, assigns every product a
        numbered ``.jpg`` path under ``./yangmatou/img/``, drops products
        without an image URL, writes the updated list back to the same
        pickle, and then downloads the images with a process pool.
        """
        self.imgurl_all_list = []
        self.product_all_list = []

        # NOTE(security): only load pickles produced by this script.
        with open(work_path + 'yangmatou' + '.txt', 'rb') as fh:
            it_goods_list = pickle.load(fh)
        print('read')

        img_save_path = './yangmatou/img/'
        mkdir(img_save_path)

        # The index advances for every product, so file numbers match the
        # product's position in the original list even when some are dropped.
        for it_goods_i_id, product in enumerate(it_goods_list):
            img_path = img_save_path + str(it_goods_i_id) + '.jpg'
            product['product_img_url_path'] = img_path
            img_url = product['product_img_url']
            if img_url == '':
                continue
            self.imgurl_all_list.append({'img_url': img_url, 'img_path': img_path})
            self.product_all_list.append(product)

        with open(work_path + 'yangmatou' + '.txt', 'wb') as fh:
            pickle.dump(self.product_all_list, fh)

        print(len(self.imgurl_all_list))
        print('down2')
        pool = Pool()
        try:
            pool.map(self.second_multi_download_img, range(len(self.imgurl_all_list)))
        finally:
            # Always release the worker processes, even if a task failed.
            pool.close()
            pool.join()
        print('down1')

    def second_multi_download_img(self, number):
        """Download image number ``number`` of ``imgurl_all_list``.

        Failures are printed rather than raised so that one broken image
        does not abort the whole batch.
        """
        entry = self.imgurl_all_list[number]
        print(entry)
        print('down')
        try:
            self.downUrl(entry['img_url'], entry['img_path'])
        except Exception as e:
            print(e)


if __name__ == '__main__':
    download_style = 'yangmatou'
    path_time = time.strftime("%m_%d")
    work_path = './gucci/' + path_time + '/' + download_style + '/'
    save_path = './gucci/' + path_time + '/'
    mkdir(work_path)

    # Step 1 - scrape the product information (uncomment to run):
    # get_product()

    # Step 2 - download the product images (uncomment to run):
    # down_img = DownImgClass()
    # down_img.multi_download_img()

    # Step 3 - write everything to Excel.
    write_excel_to_path(save_path + 'yangmatou_excel_' + path_time + '.xlsx')

页面列表

ITEM_HTML