python


11、XiaoHongShu_all

<pre><code>from selenium import webdriver import time from bs4 import BeautifulSoup import xlsxwriter import pickle browser = webdriver.Chrome() browser.get('https://www.xiaohongshu.com/page/brands/5a43848e8000862471d15040?' 'openPage=yes&amp;xhs_g_s=0066&amp;banner_id=5aa74c7e6d0bd31b991b48c3&amp;xhs_' 'channel=0090_0090_0090_0066&amp;naviHidden=yes&amp;tab=goods&amp;goods_id=5965c' '1a170e75226a4192989&amp;_at=52f170df4d108064a431fccee7464b20fa454') page_source = '' # 滑动页面,直到所有信息加载完毕 i = 300 for j in range(1,330): if j &lt; 80: js="var q=document.documentElement.scrollTop="+str(i) browser.execute_script(js) i = i+800 time.sleep(3) #j = j+ 1 else: page_source = browser.page_source pickle.dump(page_source, open('red_page_source.txt', 'wb')) product_infos = [] red_soup = BeautifulSoup(page_source, 'lxml') # product_titles = red_soup.find_all(class_='cube-goods-card__top') product_ids = red_soup.find_all(class_='good cube-goods-card') for i in range(0, len(product_ids)): try: product_img_url = \ product_ids[i](class_='cube-item-image-container cube-goods-card__img cube-image normal-image')[0].img['src'] product_id = product_ids[i]['data-id'] # 1691 product_title_1 = product_ids[i](class_='cube-goods-card__top')[0].h4.text # 1697 product_title_2 = product_ids[i](class_='cube-goods-card__top')[0].span.text # product_price_1 = product_ids[i](class_='cube-goods-card__center-left')[0].text product_price = product_ids[i](class_='cube-goods-card__center-left')[0]( class_='cube-price --sale --icon-size-m --size-m --color-red --weight-medium --decoration-')[0].text try: product_price_2 = product_ids[i](class_='cube-goods-card__center-left')[0]( class_='cube-price --sale --icon-size-xs --size-xs --color-grey --weight-medium --decoration-line-through')[ 0].text except Exception as e: product_price_2 = '' url_i = 'https://pages.xiaohongshu.com/goods/' url_o = '?xhs_g_s=0094&amp;banner_id=5aa74c7e6d0bd31b991b48c3&amp;xhs_channel=0090_0090_0066_0094&amp;naviHidden=yes&amp;openPage=yes' product_info = \ { 'product_img_url': product_img_url, 'product_url': url_i + product_id + url_o, 'product_title': product_title_2, 'product_title_1': product_title_1, 'product_price_2': product_price_2, 'product_price': product_price } product_infos.append(product_info) except Exception as e: print(i) print(e) continue download_type_number =\ { 'product_women_bag_infos':0, 'product_men_bag_infos':0, 'product_bag_infos':0, 'product_shoes_infos':0, 'product_accessories_infos':0 } def secoo_write_excel(download_type, book): # ''' # 读取变量 it_goods_list = product_infos sheet = book.add_worksheet(download_type) # 设置sheet表单元格列宽 sheet.set_column("A:A", 5) # 寺库 sheet.set_column("B:B", 112.88) # 商品名称 sheet.set_column("C:C", 10.5) # 正面图 sheet.set_column("D:D", 22.38) # 货号 sheet.set_column("E:E", 22.38) # 货号 sheet.set_column("F:F", 191) # 零售价 sheet.set_column("G:G", 32) # 零售价 sheet.set_column("H:H", 22) # 类型 # sheet.set_column("F:F", 82) # 商品链接 # 设定整个sheet表的单元格的格式 property = { 'font_size': 11, # 字体大小 'bold': False, # 是否加粗 'align': 'center', # 水平对齐方式 left 'valign': 'vcenter', # 垂直对齐方式 'font_name': u'微软雅黑', 'text_wrap': False, # 是否自动换行 } cell_format = book.add_format(property) # 设置sheet表单元格行高 sheet.set_row(0, 22) # 设置第一行的高度为22 # 在向单元格中写入内容时,加上单元格样式 # 插入第一行 sheet.write(0, 0, 'RED', cell_format) sheet.write(0, 1, '商品名称', cell_format) sheet.write(0, 2, '正面图', cell_format) sheet.write(0, 3, '优惠价', cell_format) sheet.write(0, 4, '原售价', cell_format) sheet.write(0, 5, '商品链接', cell_format) sheet.write(0, 6, '一级标题', cell_format) sheet.write(0, 7, '类型', cell_format) img_format = {'x_offset': 4, # 左右移动 'y_offset': 0, 'x_scale': 0.2, # 缩放比例 'y_scale': 0.19} # 插入爬取it_goods_list信息 row_number = 1 for product in it_goods_list: if '新草' in product['product_title']: continue if '香水' in product['product_title']: continue if '太阳' in product['product_title']: continue if '镜' in product['product_title']: continue sheet.set_row(row_number, 52) # 设置第row_number行的高度为52 sheet.write(row_number, 1, product['product_title'], cell_format) # sheet.insert_image(row_number, 2, product['url_path'], img_format) # sheet.write(row_number, 3, product['productCode'], cell_format) # product['productCode'] sheet.write(row_number, 3, str(product['product_price']), cell_format) sheet.write(row_number, 4, str(product['product_price_2']), cell_format) sheet.write(row_number, 5, product['product_url'], cell_format) sheet.write(row_number, 6, product['product_title_1'], cell_format) style = '衣服' if '包' in product['product_title']: if '女' in product['product_title']: style = '女包' else: style = '男包' if '鞋' in product['product_title']: if '女' in product['product_title']: style = '女鞋' else: style = '男鞋' if '腰带' in product['product_title']: if '女' in product['product_title']: style = '女腰带' else: style = '男腰带' if '围巾' in product['product_title']: if '女' in product['product_title']: style = '女围巾' else: style = '男围巾' sheet.write(row_number, 7, style, cell_format) row_number = row_number + 1 download_type_number[download_type] = row_number print(download_type + ': 写入EXCEL成功') return book def secoo_write_excel_to_path(excel_name): book = xlsxwriter.Workbook(excel_name) book = secoo_write_excel('all', book) book.close() print(excel_name + ': 写入EXCEL成功') return None secoo_write_excel_to_path('red_pages-27-2.xlsx') </code></pre>

页面列表

ITEM_HTML