3. xiaohongshu
<pre><code>from bs4 import BeautifulSoup
from selenium import webdriver
import time
import hashlib
import os
import pickle
import random
import xlsxwriter
import urllib.request
from multiprocessing import Pool
import socket
import json
import re
socket.setdefaulttimeout(5)
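# Overall flow: a Selenium-driven Chrome window scrolls the xiaohongshu brand page
# so that all lazy-loaded product cards render, BeautifulSoup parses the cards out
# of page_source, each product's detail page is fetched to pull the seller name out
# of an embedded JSON blob, the cover images are downloaded with a process pool,
# and everything is finally written to an .xlsx report.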
def mkdir(path):
    path = path.strip()
    path = path.rstrip("\\")
    isExists = os.path.exists(path)
    # Only create the directory if it does not exist yet
    if not isExists:
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    else:
        # The directory already exists, so skip creation
        print(path + ' 目录已存在')
        return False
def get_html(url):
    # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read().decode("utf-8")
    # print(html)
    # Soup = BeautifulSoup(html, 'lxml')
    return html
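# Drive a real Chrome window with Selenium: the goods grid on the brand page is
# rendered client-side and lazy-loads more cards as the page scrolls, which is
# why Selenium is used here instead of get_html().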
browser = webdriver.Chrome()
browser.get('https://www.xiaohongshu.com/page/brands/5a43848e8000862471d15040?'
            'openPage=yes&xhs_g_s=0066&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_'
            'channel=0090_0090_0090_0066&naviHidden=yes&tab=goods&goods_id=5965c'
            '1a170e75226a4192989&_at=52f170df4d108064a431fccee7464b20fa454')
page_source = ''
# Keep scrolling the page until every product card has been loaded
ii = 300
product_infos = []
products_set = set()
for j in range(1, 750):
    js = "var q=document.documentElement.scrollTop=" + str(ii)
    browser.execute_script(js)
    ii = ii + 800
    page_source = browser.page_source
    red_soup = BeautifulSoup(page_source, 'lxml')
    # product_titles = red_soup.find_all(class_='cube-goods-card__top')
    product_ids = red_soup.find_all(class_='good cube-goods-card')
    for i in range(0, len(product_ids)):
        try:
            product_img_url = \
                product_ids[i](class_='cube-item-image-container cube-goods-card__img cube-image normal-image')[
                    0].img['src']
            product_id = product_ids[i]['data-id']  # 1691
            product_title_1 = product_ids[i](class_='cube-goods-card__top')[0].h4.text  # 1697
            product_title_2 = product_ids[i](class_='cube-goods-card__top')[0].span.text
            # Skip cards that were already collected during an earlier scroll step
            hash_title = hashlib.md5((product_title_1 + product_title_2).encode(encoding='UTF-8')).hexdigest()
            if hash_title in products_set:
                continue
            products_set.add(hash_title)
            # product_price_1 = product_ids[i](class_='cube-goods-card__center-left')[0].text
            product_price = product_ids[i](class_='cube-goods-card__center-left')[0](
                class_='cube-price --sale --icon-size-m --size-m --color-red --weight-medium --decoration-')[0].text
            try:
                product_price_2 = product_ids[i](class_='cube-goods-card__center-left')[0](
                    class_='cube-price --sale --icon-size-xs --size-xs --color-grey --weight-medium --decoration-line-through')[
                    0].text
            except Exception as e:
                product_price_2 = ''
            url_i = 'https://pages.xiaohongshu.com/goods/'
            url_o = '?xhs_g_s=0094&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_channel=0090_0090_0066_0094&naviHidden=yes&openPage=yes'
            product_info = \
                {
                    'product_img_url': product_img_url,
                    'product_url': url_i + product_id + url_o,
                    'product_title': product_title_2,
                    'product_title_1': product_title_1,
                    'product_price_2': product_price_2,
                    'product_price': product_price
                }
            product_infos.append(product_info)
        except Exception as e:
            print(i)
            print(e)
            continue
    time.sleep(1)
    # j = j + 1
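# Enrich every product with its shop name: fetch the goods detail page and pull
# the seller out of the JSON object embedded in the HTML (the path read below is
# Main -> basicData -> items[0] -> seller -> name).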
product_with_shop_names = []
for product in product_infos:
    detail_html = get_html(product['product_url'])
    result = re.search(r'\{"Main.*?(.*?).*?\}\}\}', detail_html)
    result = json.loads(result.group())
    result_shop_name = result['Main']['basicData']['items'][0]['seller']['name']
    product['shop_name'] = result_shop_name
    product_with_shop_names.append(product)
mkdir('./xiaoHong/')  # make sure the output directory exists before dumping
pickle.dump(product_with_shop_names, open('./xiaoHong/product_with_shop_names.txt', 'wb'))
# import urllib.request
# it_goods_i_id = 0
# img_save_path = './xiaoHong/img/'
# mkdir(img_save_path)
# product_list = []
# for product in product_with_shop_names:
# product['product_img_url_path'] = img_save_path + str(it_goods_i_id) + '.jpg'
# product_list.append(product)
# it_goods_i_id = it_goods_i_id + 1
download_type_number = \
    {
        'product_women_bag_infos': 0,
        'product_men_bag_infos': 0,
        'product_bag_infos': 0,
        'product_shoes_infos': 0,
        'product_accessories_infos': 0
    }
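# Write one worksheet per call: fixed column widths, a header row, then one row per
# product with the locally downloaded cover image embedded via insert_image.
# Products whose titles contain a few unwanted keywords are skipped.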
def secoo_write_excel(product_infos, download_type, book):
    sheet = book.add_worksheet(download_type)
    # Column widths for the sheet
    sheet.set_column("A:A", 5)       # RED
    sheet.set_column("B:B", 112.88)  # product title
    sheet.set_column("C:C", 10.5)    # front image
    sheet.set_column("D:D", 22.38)   # sale price
    sheet.set_column("E:E", 22.38)   # original price
    sheet.set_column("F:F", 191)     # product link
    sheet.set_column("G:G", 32)      # primary title
    sheet.set_column("H:H", 22)      # category
    sheet.set_column("I:I", 22)      # shop
    # sheet.set_column("F:F", 82)    # product link
    # Default cell format for the whole sheet
    property = {
        'font_size': 11,      # font size
        'bold': False,        # bold or not
        'align': 'center',    # horizontal alignment
        'valign': 'vcenter',  # vertical alignment
        'font_name': u'微软雅黑',  # Microsoft YaHei
        'text_wrap': False,   # wrap text automatically or not
    }
    cell_format = book.add_format(property)
    # Row heights
    sheet.set_row(0, 22)  # header row height 22
    # Write the header row, applying the cell format to every cell
    sheet.write(0, 0, 'RED', cell_format)
    sheet.write(0, 1, '商品名称', cell_format)
    sheet.write(0, 2, '正面图', cell_format)
    sheet.write(0, 3, '优惠价', cell_format)
    sheet.write(0, 4, '原售价', cell_format)
    sheet.write(0, 5, '商品链接', cell_format)
    sheet.write(0, 6, '一级标题', cell_format)
    sheet.write(0, 7, '类型', cell_format)
    sheet.write(0, 8, '店铺', cell_format)
    img_format = {'x_offset': 4,    # horizontal offset
                  'y_offset': 0,
                  'x_scale': 0.2,   # image scale factors
                  'y_scale': 0.19}
    # Write one row per scraped product
    row_number = 1
    for product in product_infos:
        # Skip products whose titles match unwanted keywords
        if '新草' in product['product_title']:
            continue
        if '香水' in product['product_title']:
            continue
        if '太阳' in product['product_title']:
            continue
        if '镜' in product['product_title']:
            continue
        sheet.set_row(row_number, 52)  # data row height 52
        sheet.write(row_number, 1, product['product_title'], cell_format)
        sheet.insert_image(row_number, 2, product['product_img_url_path'], img_format)
        # sheet.write(row_number, 3, product['productCode'], cell_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, str(product['product_price_2']), cell_format)
        sheet.write(row_number, 5, product['product_url'], cell_format)
        sheet.write(row_number, 6, product['product_title_1'], cell_format)
        sheet.write(row_number, 8, product['shop_name'], cell_format)
        # Rough category guess based on keywords in the title
        style = '衣服'
        if '包' in product['product_title']:
            if '女' in product['product_title']:
                style = '女包'
            else:
                style = '男包'
        if '鞋' in product['product_title']:
            if '女' in product['product_title']:
                style = '女鞋'
            else:
                style = '男鞋'
        if '腰带' in product['product_title']:
            if '女' in product['product_title']:
                style = '女腰带'
            else:
                style = '男腰带'
        if '围巾' in product['product_title']:
            if '女' in product['product_title']:
                style = '女围巾'
            else:
                style = '男围巾'
        sheet.write(row_number, 7, style, cell_format)
        row_number = row_number + 1
    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book
def secoo_write_excel_to_path(product_infos, excel_name):
    book = xlsxwriter.Workbook(excel_name)
    book = secoo_write_excel(product_infos, 'all', book)
    book.close()
    print(excel_name + ': 写入EXCEL成功')
    return None
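# Image download helper. Note: multiprocessing.Pool maps a bound method here, which
# works under the fork start method on Linux; with the spawn start method (Windows,
# and macOS on newer Python versions) each worker re-imports this module and would
# re-run the module-level scraping code above.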
class DownImgClass:
    """A simple helper class for downloading product images."""
    def __init__(self):
        self.imgurl_all_list = []

    def downUrl(self, url, path_name):
        urllib.request.urlretrieve(url, path_name)
        print(path_name + ": success")
        return None

    def multi_download_img(self):
        self.imgurl_all_list = []
        # Load the products scraped earlier
        product_info_list = pickle.load(open('./xiaoHong/product_with_shop_names.txt', 'rb'))
        print('read')
        it_goods_i_id = 0
        img_save_path = './xiaoHong/img/'
        mkdir(img_save_path)
        self.product_all_list = []
        for product in product_info_list:
            product['product_img_url_path'] = img_save_path + str(it_goods_i_id) + '.jpg'
            img_path = img_save_path + str(it_goods_i_id) + '.jpg'
            it_goods_i_id = it_goods_i_id + 1
            img_url = product['product_img_url']
            if img_url == '':
                continue
            # img_path = product['product_img_url_path']
            img_url_dict = {'img_url': img_url,
                            'img_path': img_path}
            self.imgurl_all_list.append(img_url_dict)
            self.product_all_list.append(product)
        pickle.dump(self.product_all_list, open('./xiaoHong/red_product_with_image.txt', 'wb'))
        print(len(self.imgurl_all_list))
        print('down2')
        pool = Pool()
        pool.map(self.second_multi_download_img, [i for i in range(len(self.imgurl_all_list))])
        pool.close()
        pool.join()
        print('down1')

    def second_multi_download_img(self, number):
        print(self.imgurl_all_list[number])
        img_url = self.imgurl_all_list[number]['img_url']
        url_path = self.imgurl_all_list[number]['img_path']
        print('down')
        try:
            self.downUrl(img_url, url_path)
        except Exception as e:
            print(e)
if __name__ == '__main__':
    a = time.time()
    # Scrape product info (already done at module level above)
    c = time.time()
    d = a - c
    # Download the product images
    down_img = DownImgClass()
    down_img.multi_download_img()
    b = time.time()
    # Report the elapsed time in minutes
    print((b - a) / 60)
    print(d / 60)
    download_style = 'xiaohongshu'
    path_time = time.strftime("%m_%d")
    work_path = './gucci/' + path_time + '/' + download_style + '/'
    save_path = './gucci/' + path_time + '/'
    mkdir(work_path)
    # Write the products (with local image paths) to an Excel workbook
    product_info_list = pickle.load(open('./xiaoHong/red_product_with_image.txt', 'rb'))
    secoo_write_excel_to_path(product_info_list, save_path + 'red_pages' + path_time + '.xlsx')
</code></pre>
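The shop name above comes from a JSON object embedded in each goods detail page. Below is a minimal, self-contained sketch of that extraction step; the sample_html string is a made-up stand-in for the real page source, but the Main → basicData → items → seller nesting is the same path the crawler reads:
<pre><code>import json
import re

# Hypothetical stand-in for a goods detail page; the real payload is far larger.
sample_html = ('<script>var data = '
               '{"Main": {"basicData": {"items": [{"seller": {"name": "Demo Shop"}}]}}}'
               ';</script>')

# Same idea as the re.search call in the crawler: grab the object that starts at
# {"Main and ends at the first run of three closing braces, then parse it.
match = re.search(r'\{"Main.*?\}\}\}', sample_html)
if match:
    data = json.loads(match.group())
    print(data['Main']['basicData']['items'][0]['seller']['name'])  # Demo Shop
</code></pre>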