python


3、xiaohongshu
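The script below drives Chrome through Selenium and scrolls the Xiaohongshu brand page until all lazily loaded product cards appear, parses each card with BeautifulSoup (image URL, title, sale and original price), fetches every product page to read the shop name out of the JSON embedded in the HTML, downloads the product images in parallel with a multiprocessing pool, and finally writes the results to an Excel sheet with xlsxwriter.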

<pre><code>from bs4 import BeautifulSoup
from selenium import webdriver
import time
import hashlib
import os
import pickle
import random
import xlsxwriter
import urllib.request
from multiprocessing import Pool
import socket
import json
import re

socket.setdefaulttimeout(5)


def mkdir(path):
    path = path.strip()
    path = path.rstrip("\\")
    isExists = os.path.exists(path)
    # Only create the directory if it does not exist yet
    if not isExists:
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        # The directory already exists, so do not create it again
        print(path + ' 目录已存在')
        return False


def get_html(url):
    # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read().decode("utf-8")
    # print(html)
    # Soup = BeautifulSoup(html, 'lxml')
    return html


# The scraping below runs at module level; the __main__ block at the bottom
# downloads the images and writes the Excel file afterwards.
# Open the brand page in Chrome; the product cards are lazy-loaded while scrolling.
browser = webdriver.Chrome()
browser.get('https://www.xiaohongshu.com/page/brands/5a43848e8000862471d15040?'
            'openPage=yes&xhs_g_s=0066&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_'
            'channel=0090_0090_0090_0066&naviHidden=yes&tab=goods&goods_id=5965c'
            '1a170e75226a4192989&_at=52f170df4d108064a431fccee7464b20fa454')
page_source = ''

# Scroll the page step by step until all products are loaded
ii = 300
product_infos = []
products_set = set()
for j in range(1, 750):
    js = "var q=document.documentElement.scrollTop=" + str(ii)
    browser.execute_script(js)
    ii = ii + 800
    page_source = browser.page_source
    red_soup = BeautifulSoup(page_source, 'lxml')
    # product_titles = red_soup.find_all(class_='cube-goods-card__top')
    product_ids = red_soup.find_all(class_='good cube-goods-card')
    for i in range(0, len(product_ids)):
        try:
            product_img_url = \
                product_ids[i](class_='cube-item-image-container cube-goods-card__img cube-image normal-image')[
                    0].img['src']
            product_id = product_ids[i]['data-id']
            product_title_1 = product_ids[i](class_='cube-goods-card__top')[0].h4.text
            product_title_2 = product_ids[i](class_='cube-goods-card__top')[0].span.text
            # Deduplicate cards that show up again after each scroll
            hash_title = hashlib.md5((product_title_1 + product_title_2).encode(encoding='UTF-8')).hexdigest()
            if hash_title in products_set:
                continue
            products_set.add(hash_title)
            # product_price_1 = product_ids[i](class_='cube-goods-card__center-left')[0].text
            product_price = product_ids[i](class_='cube-goods-card__center-left')[0](
                class_='cube-price --sale --icon-size-m --size-m --color-red --weight-medium --decoration-')[0].text
            try:
                product_price_2 = product_ids[i](class_='cube-goods-card__center-left')[0](
                    class_='cube-price --sale --icon-size-xs --size-xs --color-grey --weight-medium --decoration-line-through')[
                    0].text
            except Exception:
                product_price_2 = ''
            url_i = 'https://pages.xiaohongshu.com/goods/'
            url_o = '?xhs_g_s=0094&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_channel=0090_0090_0066_0094&naviHidden=yes&openPage=yes'
            product_info = {
                'product_img_url': product_img_url,
                'product_url': url_i + product_id + url_o,
                'product_title': product_title_2,
                'product_title_1': product_title_1,
                'product_price_2': product_price_2,
                'product_price': product_price
            }
            product_infos.append(product_info)
        except Exception as e:
            print(i)
            print(e)
            continue
    time.sleep(1)

# Fetch every product page and pull the shop name out of the embedded JSON
product_with_shop_names = []
for product in product_infos:
    soup = get_html(product['product_url'])
    result = re.search(r'\{\"Main.*?(.*?).*?\}\}\}', soup)
    result = json.loads(result.group())
    result_shop_name = result['Main']['basicData']['items'][0]['seller']['name']
    product['shop_name'] = result_shop_name
    product_with_shop_names.append(product)

# The image-download step below reads this file from ./xiaoHong/, so dump it there
mkdir('./xiaoHong/')
pickle.dump(product_with_shop_names, open('./xiaoHong/product_with_shop_names.txt', 'wb'))

download_type_number = {
    'product_women_bag_infos': 0,
    'product_men_bag_infos': 0,
    'product_bag_infos': 0,
    'product_shoes_infos': 0,
    'product_accessories_infos': 0
}


def secoo_write_excel(product_infos, download_type, book):
    sheet = book.add_worksheet(download_type)
    # Column widths
    sheet.set_column("A:A", 5)       # source
    sheet.set_column("B:B", 112.88)  # product name
    sheet.set_column("C:C", 10.5)    # front image
    sheet.set_column("D:D", 22.38)   # sale price
    sheet.set_column("E:E", 22.38)   # original price
    sheet.set_column("F:F", 191)     # product link
    sheet.set_column("G:G", 32)      # first-level title
    sheet.set_column("H:H", 22)      # category
    sheet.set_column("I:I", 22)      # shop
    # Default cell format for the whole sheet
    property = {
        'font_size': 11,      # font size
        'bold': False,        # bold
        'align': 'center',    # horizontal alignment
        'valign': 'vcenter',  # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,   # wrap text
    }
    cell_format = book.add_format(property)
    sheet.set_row(0, 22)  # header row height
    # Header row
    sheet.write(0, 0, 'RED', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '优惠价', cell_format)
    sheet.write(0, 4, '原售价', cell_format)
    sheet.write(0, 5, '商品链接', cell_format)
    sheet.write(0, 6, '一级标题', cell_format)
    sheet.write(0, 7, '类型', cell_format)
    sheet.write(0, 8, '店铺', cell_format)
    img_format = {'x_offset': 4,   # horizontal offset
                  'y_offset': 0,
                  'x_scale': 0.2,  # scale factor
                  'y_scale': 0.19}
    # Write one row per scraped product
    row_number = 1
    for product in product_infos:
        # Skip unwanted categories by title keyword
        if '新草' in product['product_title']:
            continue
        if '香水' in product['product_title']:
            continue
        if '太阳' in product['product_title']:
            continue
        if '镜' in product['product_title']:
            continue
        sheet.set_row(row_number, 52)  # row height for the embedded image
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['product_img_url_path'], img_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, str(product['product_price_2']), cell_format)
        sheet.write(row_number, 5, product['product_url'], cell_format)
        sheet.write(row_number, 6, product['product_title_1'], cell_format)
        sheet.write(row_number, 8, product['shop_name'], cell_format)
        # Rough category derived from title keywords
        style = '衣服'
        if '包' in product['product_title']:
            style = '女包' if '女' in product['product_title'] else '男包'
        if '鞋' in product['product_title']:
            style = '女鞋' if '女' in product['product_title'] else '男鞋'
        if '腰带' in product['product_title']:
            style = '女腰带' if '女' in product['product_title'] else '男腰带'
        if '围巾' in product['product_title']:
            style = '女围巾' if '女' in product['product_title'] else '男围巾'
        sheet.write(row_number, 7, style, cell_format)
        row_number = row_number + 1
    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book


def secoo_write_excel_to_path(product_infos, excel_name):
    book = xlsxwriter.Workbook(excel_name)
    book = secoo_write_excel(product_infos, 'all', book)
    book.close()
    print(excel_name + ': 写入EXCEL成功')
    return None


class DownImgClass:
    """A small helper that downloads the product images in parallel."""

    def __init__(self):
        self.imgurl_all_list = []

    def downUrl(self, url, path_name):
        urllib.request.urlretrieve(url, path_name)
        print(path_name + ": success")
        return None

    def multi_download_img(self):
        self.imgurl_all_list = []
        # Load the scraped products (with shop names) from the pickle dump
        product_info_list = pickle.load(open('./xiaoHong/product_with_shop_names.txt', 'rb'))
        print('read')
        it_goods_i_id = 0
        img_save_path = './xiaoHong/img/'
        mkdir(img_save_path)
        self.product_all_list = []
        for product in product_info_list:
            product['product_img_url_path'] = img_save_path + str(it_goods_i_id) + '.jpg'
            img_path = img_save_path + str(it_goods_i_id) + '.jpg'
            it_goods_i_id = it_goods_i_id + 1
            img_url = product['product_img_url']
            if img_url == '':
                continue
            img_url_dict = {'img_url': img_url, 'img_path': img_path}
            self.imgurl_all_list.append(img_url_dict)
            self.product_all_list.append(product)
        pickle.dump(self.product_all_list, open('./xiaoHong/red_product_with_image.txt', 'wb'))
        print(len(self.imgurl_all_list))
        # Download all images with a process pool
        pool = Pool()
        pool.map(self.second_multi_download_img, [i for i in range(len(self.imgurl_all_list))])
        pool.close()
        pool.join()

    def second_multi_download_img(self, number):
        img_url = self.imgurl_all_list[number]['img_url']
        url_path = self.imgurl_all_list[number]['img_path']
        try:
            self.downUrl(img_url, url_path)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    a = time.time()
    # Download the product images
    down_img = DownImgClass()
    down_img.multi_download_img()
    b = time.time()
    # Total time spent, in minutes
    print((b - a) / 60)
    download_style = 'xiaohongshu'
    path_time = time.strftime("%m_%d")
    work_path = './gucci/' + path_time + '/' + download_style + '/'
    save_path = './gucci/' + path_time + '/'
    mkdir(work_path)
    # Load the products that have a local image and write them to Excel
    product_info_list = pickle.load(open('./xiaoHong/red_product_with_image.txt', 'rb'))
    secoo_write_excel_to_path(product_info_list, save_path + 'red_pages' + path_time + '.xlsx')
</code></pre>
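The shop name above is extracted with a hand-written regex (`\{\"Main.*?\}\}\}`) that effectively counts closing braces and breaks as soon as the embedded JSON changes its nesting depth. A more tolerant alternative, sketched below, is to find the anchor text and let `json.JSONDecoder.raw_decode` determine where the object ends. This is a minimal sketch, not the original author's method: it assumes, like the original regex, that the product page embeds a JSON object beginning with `{"Main"`, and the `extract_embedded_json` helper name is ours.

<pre><code>import json


def extract_embedded_json(html, anchor='{"Main"'):
    """Return the first JSON object in `html` that starts at `anchor`, or None."""
    start = html.find(anchor)
    if start == -1:
        return None
    # raw_decode() parses exactly one JSON value and reports where it ends,
    # so no hand-written brace-matching regex is needed.
    obj, _end = json.JSONDecoder().raw_decode(html[start:])
    return obj


# Hypothetical drop-in for the shop-name loop above:
# result = extract_embedded_json(get_html(product['product_url']))
# shop_name = result['Main']['basicData']['items'][0]['seller']['name']
</code></pre>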

Page list

ITEM_HTML