11、XiaoHongShu_all
<pre><code>from selenium import webdriver
import time
from bs4 import BeautifulSoup
import xlsxwriter
import pickle
# Open the brand's goods tab in a Selenium-driven Chrome session.
# NOTE(review): the browser is never closed by this script; call
# browser.quit() once no further page interaction is needed.
browser = webdriver.Chrome()
browser.get('https://www.xiaohongshu.com/page/brands/5a43848e8000862471d15040?'
            'openPage=yes&xhs_g_s=0066&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_'
            'channel=0090_0090_0090_0066&naviHidden=yes&tab=goods&goods_id=5965c'
            '1a170e75226a4192989&_at=52f170df4d108064a431fccee7464b20fa454')
page_source = ''

# Scroll down in fixed steps, pausing after each one, so the page can load
# more items before the HTML is captured.  79 steps of 800px starting at
# 300px are performed; this assumes 79 steps reach the end of the list --
# TODO confirm against the live page.
scroll_start = 300
scroll_step = 800
scroll_count = 79
for scroll_top in range(scroll_start,
                        scroll_start + scroll_count * scroll_step,
                        scroll_step):
    js = "var q=document.documentElement.scrollTop=" + str(scroll_top)
    browser.execute_script(js)
    time.sleep(3)

# Capture the rendered HTML once, after all scrolling is done, and keep a
# pickled copy on disk.  The file is closed deterministically by `with`.
page_source = browser.page_source
with open('red_page_source.txt', 'wb') as dump_file:
    pickle.dump(page_source, dump_file)
# Parse the captured HTML and collect one dict per product card into
# product_infos (read later by secoo_write_excel).
product_infos = []
red_soup = BeautifulSoup(page_source, 'lxml')
product_ids = red_soup.find_all(class_='good cube-goods-card')

# Class attribute values used to locate the parts of a product card.
IMAGE_BOX_CLASS = 'cube-item-image-container cube-goods-card__img cube-image normal-image'
TITLE_BOX_CLASS = 'cube-goods-card__top'
PRICE_BOX_CLASS = 'cube-goods-card__center-left'
SALE_PRICE_CLASS = 'cube-price --sale --icon-size-m --size-m --color-red --weight-medium --decoration-'
ORIGINAL_PRICE_CLASS = 'cube-price --sale --icon-size-xs --size-xs --color-grey --weight-medium --decoration-line-through'

# Fixed prefix/suffix placed around a product's data-id to build its URL.
url_i = 'https://pages.xiaohongshu.com/goods/'
url_o = '?xhs_g_s=0094&banner_id=5aa74c7e6d0bd31b991b48c3&xhs_channel=0090_0090_0066_0094&naviHidden=yes&openPage=yes'

for i, card in enumerate(product_ids):
    try:
        # Calling a tag is BeautifulSoup shorthand for tag.find_all(...).
        product_img_url = card(class_=IMAGE_BOX_CLASS)[0].img['src']
        product_id = card['data-id']

        title_box = card(class_=TITLE_BOX_CLASS)[0]
        product_title_1 = title_box.h4.text
        product_title_2 = title_box.span.text

        price_box = card(class_=PRICE_BOX_CLASS)[0]
        product_price = price_box(class_=SALE_PRICE_CLASS)[0].text
        try:
            product_price_2 = price_box(class_=ORIGINAL_PRICE_CLASS)[0].text
        except IndexError:
            # The card has no struck-through price element.
            product_price_2 = ''

        product_infos.append({
            'product_img_url': product_img_url,
            'product_url': url_i + product_id + url_o,
            'product_title': product_title_2,
            'product_title_1': product_title_1,
            'product_price_2': product_price_2,
            'product_price': product_price,
        })
    except Exception as e:
        # Best effort: a card missing any required element is reported
        # (index, then error) and skipped instead of aborting the run.
        print(i)
        print(e)
        continue
# Per-sheet bookkeeping keyed by name; every listed category starts at 0.
# secoo_write_excel stores its final row index here under the name of the
# sheet it wrote.
download_type_number = dict.fromkeys(
    (
        'product_women_bag_infos',
        'product_men_bag_infos',
        'product_bag_infos',
        'product_shoes_infos',
        'product_accessories_infos',
    ),
    0,
)
def _is_excluded_title(title):
    """Return True when *title* contains a keyword for goods left out of the sheet."""
    excluded_keywords = ('新草', '香水', '太阳', '镜')
    return any(keyword in title for keyword in excluded_keywords)


def _classify_product(title):
    """Return the category label for *title*.

    The default label is '衣服' (clothes).  When the title contains one of
    the keywords 包 (bag), 鞋 (shoe), 腰带 (belt) or 围巾 (scarf), the label
    becomes that keyword prefixed with '女' if the title contains '女' and
    with '男' otherwise.  If several keywords occur, the last one in the
    order above wins.
    """
    style = '衣服'
    gender = '女' if '女' in title else '男'
    for keyword in ('包', '鞋', '腰带', '围巾'):
        if keyword in title:
            style = gender + keyword
    return style


def secoo_write_excel(download_type, book):
    """Add a worksheet named *download_type* to *book* and fill it with products.

    The rows come from the module-level ``product_infos`` list.  Products
    whose title matches ``_is_excluded_title`` are skipped.  After writing,
    ``download_type_number[download_type]`` is set to the index of the next
    free row (number of written products + 1, because row 0 is the header).

    Args:
        download_type: Name of the worksheet to create.
        book: Workbook object providing ``add_worksheet`` and ``add_format``.

    Returns:
        The same *book* object that was passed in.
    """
    it_goods_list = product_infos
    sheet = book.add_worksheet(download_type)

    # Column widths, in header order.  Columns A and C only receive a header
    # cell; no per-product data is written to them.
    column_widths = (
        ("A:A", 5),        # 'RED' marker column
        ("B:B", 112.88),   # product name
        ("C:C", 10.5),     # front image
        ("D:D", 22.38),    # sale price
        ("E:E", 22.38),    # original price
        ("F:F", 191),      # product URL
        ("G:G", 32),       # first-level title
        ("H:H", 22),       # category
    )
    for column_range, width in column_widths:
        sheet.set_column(column_range, width)

    # One shared cell format for every cell written to this sheet.
    cell_format = book.add_format({
        'font_size': 11,
        'bold': False,
        'align': 'center',      # horizontal alignment
        'valign': 'vcenter',    # vertical alignment
        'font_name': u'微软雅黑',
        'text_wrap': False,
    })

    # Header row.
    sheet.set_row(0, 22)
    headers = ('RED', '商品名称', '正面图', '优惠价', '原售价', '商品链接', '一级标题', '类型')
    for column, header in enumerate(headers):
        sheet.write(0, column, header, cell_format)

    # One row per product that passes the title filter.
    row_number = 1
    for product in it_goods_list:
        title = product['product_title']
        if _is_excluded_title(title):
            continue
        sheet.set_row(row_number, 52)
        sheet.write(row_number, 1, title, cell_format)
        sheet.write(row_number, 3, str(product['product_price']), cell_format)
        sheet.write(row_number, 4, str(product['product_price_2']), cell_format)
        sheet.write(row_number, 5, product['product_url'], cell_format)
        sheet.write(row_number, 6, product['product_title_1'], cell_format)
        sheet.write(row_number, 7, _classify_product(title), cell_format)
        row_number += 1

    download_type_number[download_type] = row_number
    print(download_type + ': 写入EXCEL成功')
    return book
def secoo_write_excel_to_path(excel_name):
    """Create the workbook *excel_name*, fill its single 'all' sheet and close it.

    The sheet content is produced by ``secoo_write_excel``.  A confirmation
    line is printed after the workbook has been closed.  Returns ``None``.
    """
    workbook = secoo_write_excel('all', xlsxwriter.Workbook(excel_name))
    # Closing the workbook finalizes the output file.
    workbook.close()
    success_message = excel_name + ': 写入EXCEL成功'
    print(success_message)
# Script entry: export the scraped products to this workbook file.
secoo_write_excel_to_path('red_pages-27-2.xlsx')
</code></pre>