15、爬网站信息、保存在excel
<p>爬取信息,存至excel</p>
<pre><code>from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import ssl
import time
import random
import xlsxwriter
ssl._create_default_https_context = ssl._create_unverified_context
# 返回html的soup解析
def openUrl(url):
headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req) #请求
html = response.read().decode("utf-8")
Soup = BeautifulSoup(html, 'lxml')
return Soup
# 下载图片到本地
def downUrl(url,path_name):
urllib.request.urlretrieve(url, path_name)
print(path_name+": success")
return None
# 正则化str
def re_str(str):
return str.replace(' ','').replace('\r','').replace('\n','').replace('\t','')
# 计算欧元 此处汇率为7.8737
def rmb2eur(goods_price):
rmb_price = goods_price.replace('¥', '').replace(',', '')
EUR_price = (int(rmb_price) - 200) / 7.8737
eur_price = int(EUR_price)
return eur_price
def html_to_infor(url):
# -------解析html---------
Soup = openUrl(url)
goods_price = Soup.find_all(class_="goods-price")
product_name = Soup.find_all(class_='spice-product-name')
item_title = Soup.find_all(class_="spice-style-number-title")
img = Soup.find_all(class_="spice-standard-image")
not_find = Soup.find_all(class_="spice-payment-img-content")
# -------提取信息---------
if len(product_name) > 0:
# 找到商品
product_name = re_str(product_name[0].text)
goods_price = re_str(goods_price[0].text)
item_title = re_str(item_title[0].text)
imgs = re_str(str(img[0].img["spice-data-image-src"]))#re_str(img[0])
imgs_list = imgs.split(",")
img2_list = imgs_list[2].split(":")
img_url = 'https:' + img2_list[2]
img_url = img_url[0:len(img_url)-1]
return product_name,goods_price,item_title,img_url
else:
# 找不到
return 0, 0, 0, 0
# 从excel表格读取所有要下载的Item No.
# df = pd.DataFrame(pd.read_excel('./file.xls',usecols=[1, 3],header=0))
# lables = df['Item No.']
# item_ids = []
# for i in range(0,16):
# item_ids.append(lables[i])
# -----------------------------------
# 或者直接使用list中的
item_ids = ['400249KU23N8487', '474135K5RLN1095', '5410610U14X5879', '523293K5RMN9769', '443496DRW3T6433', '448075CWLMT9090', '431665D4ZIX1060', '523599K5RLN1095', '4195849HXAT8666']
# -------url---------
url_i = 'https://www.gucci.cn/zh/pr/'
url_o = '?listName=SearchResultGridComponent&position=1&categoryPath='
success_number = 0
fail_number = 0
fail_list = []
product_name_list = []
rmb_list = []
item_list = []
# 遍历提取所有需要爬的信息
for item_id in item_ids:
url = url_i + item_id + url_o
product_name, goods_price, item_title, img_url='','','',''
try:
product_name, goods_price, item_title, img_url = html_to_infor(url)
if product_name == 0:
fail_list.append(item_id)
fail_number = fail_number + 1
print(url)
print('未找到编号:'+item_id)
else:
print('-----------------------')
print(product_name)
print(goods_price)
print(rmb2eur(goods_price))
print(item_title)
product_name_list.append(product_name)
rmb_list.append(goods_price)
item_list.append(item_id)
# print(img_url)
item_img_name = item_title[2:len(item_title)]
downUrl(img_url,'./'+item_img_name+'.jpg')
success_number = success_number+1
print('**********************')
except Exception as e:
fail_list.append(item_id)
fail_number = fail_number + 1
print(e)
continue
time.sleep(random.randint(1, 3))
print('共计'+str(len(item_ids)))
print('成功'+str(success_number))
print('失败'+str(fail_number))
print('失败的编号:'+str(fail_list))
# -------写入EXCEL---------
book = xlsxwriter.Workbook('product_infor.xlsx')
sheet = book.add_worksheet('Sheet1')
sheet_fail = book.add_worksheet('Fail')
# 设置sheet表单元格列宽
sheet.set_column("A:A", 21.88) # 设定A列列宽为21.88
sheet.set_column("B:B", 10.5)
sheet.set_column("C:C", 19.38)
sheet.set_column("D:D", 8.25)
sheet.set_column("E:E", 10.75)
# 设定整个sheet表的单元格的格式
property = {
'font_size': 11, # 字体大小
'bold': False, # 是否加粗
'align': 'center', # 水平对齐方式 left
'valign': 'vcenter', # 垂直对齐方式
'font_name': u'微软雅黑',
'text_wrap': False, # 是否自动换行
}
cell_format = book.add_format(property)
# 设置sheet表单元格行高
sheet.set_row(0, 22) # 设置第一行的高度为22
# 在向单元格中写入内容时,加上单元格样式
# 插入第一行
sheet.write(0, 0, 'Product Name', cell_format)
sheet.write(0, 1, 'Show', cell_format)
sheet.write(0, 2, 'Item No.', cell_format)
sheet.write(0, 3, 'EUR RP', cell_format)
sheet.write(0, 4, 'Offer Price', cell_format)
img_format = {'x_offset':4, # 左右移动
'y_offset':0,
'x_scale':0.2, # 缩放比例
'y_scale':0.19}
# 插入爬取信息
row_number = 1
for (product_name,rmb,item_id) in zip(product_name_list,rmb_list,item_list):
sheet.set_row(row_number, 52) # 设置第row_number行的高度为52
sheet.write(row_number, 0, product_name, cell_format)
sheet.insert_image(row_number,1, './'+item_id+'.jpg' ,img_format)
sheet.write(row_number, 2, item_id, cell_format)
sheet.write(row_number, 3, rmb2eur(rmb), cell_format)
sheet.write(row_number, 4, rmb, cell_format)
row_number = row_number + 1
# 插入失败信息
sheet_fail.set_column("A:A", 19.38)
sheet_fail.write(0, 0, 'Item No.', cell_format)
row_number = 1
for item_id in fail_list:
sheet_fail.set_row(row_number, 26)
sheet_fail.write(row_number, 0, item_id, cell_format)
row_number = row_number + 1
book.close()
print('写入EXCEL成功')
</code></pre>