15、爬网站信息、保存在excel

<p>爬取信息，存至excel</p> <pre><code>from bs4 import BeautifulSoup import urllib.request import pandas as pd import ssl import time import random import xlsxwriter ssl._create_default_https_context = ssl._create_unverified_context # 返回html的soup解析 def openUrl(url): headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} req = urllib.request.Request(url, headers=headers) response = urllib.request.urlopen(req) #请求 html = response.read().decode("utf-8") Soup = BeautifulSoup(html, 'lxml') return Soup # 下载图片到本地 def downUrl(url,path_name): urllib.request.urlretrieve(url, path_name) print(path_name+": success") return None # 正则化str def re_str(str): return str.replace(' ','').replace('\r','').replace('\n','').replace('\t','') # 计算欧元此处汇率为7.8737 def rmb2eur(goods_price): rmb_price = goods_price.replace('￥', '').replace(',', '') EUR_price = (int(rmb_price) - 200) / 7.8737 eur_price = int(EUR_price) return eur_price def html_to_infor(url): # -------解析html--------- Soup = openUrl(url) goods_price = Soup.find_all(class_="goods-price") product_name = Soup.find_all(class_='spice-product-name') item_title = Soup.find_all(class_="spice-style-number-title") img = Soup.find_all(class_="spice-standard-image") not_find = Soup.find_all(class_="spice-payment-img-content") # -------提取信息--------- if len(product_name) > 0: # 找到商品 product_name = re_str(product_name[0].text) goods_price = re_str(goods_price[0].text) item_title = re_str(item_title[0].text) imgs = re_str(str(img[0].img["spice-data-image-src"]))#re_str(img[0]) imgs_list = imgs.split(",") img2_list = imgs_list[2].split(":") img_url = 'https:' + img2_list[2] img_url = img_url[0:len(img_url)-1] return product_name,goods_price,item_title,img_url else: # 找不到 return 0, 0, 0, 0 # 从excel表格读取所有要下载的Item No. # df = pd.DataFrame(pd.read_excel('./file.xls',usecols=[1, 3],header=0)) # lables = df['Item No.'] # item_ids = [] # for i in range(0,16): # item_ids.append(lables[i]) # ----------------------------------- # 或者直接使用list中的 item_ids = ['400249KU23N8487', '474135K5RLN1095', '5410610U14X5879', '523293K5RMN9769', '443496DRW3T6433', '448075CWLMT9090', '431665D4ZIX1060', '523599K5RLN1095', '4195849HXAT8666'] # -------url--------- url_i = 'https://www.gucci.cn/zh/pr/' url_o = '?listName=SearchResultGridComponent&position=1&categoryPath=' success_number = 0 fail_number = 0 fail_list = [] product_name_list = [] rmb_list = [] item_list = [] # 遍历提取所有需要爬的信息 for item_id in item_ids: url = url_i + item_id + url_o product_name, goods_price, item_title, img_url='','','','' try: product_name, goods_price, item_title, img_url = html_to_infor(url) if product_name == 0: fail_list.append(item_id) fail_number = fail_number + 1 print(url) print('未找到编号：'+item_id) else: print('-----------------------') print(product_name) print(goods_price) print(rmb2eur(goods_price)) print(item_title) product_name_list.append(product_name) rmb_list.append(goods_price) item_list.append(item_id) # print(img_url) item_img_name = item_title[2:len(item_title)] downUrl(img_url,'./'+item_img_name+'.jpg') success_number = success_number+1 print('**********************') except Exception as e: fail_list.append(item_id) fail_number = fail_number + 1 print(e) continue time.sleep(random.randint(1, 3)) print('共计'+str(len(item_ids))) print('成功'+str(success_number)) print('失败'+str(fail_number)) print('失败的编号：'+str(fail_list)) # -------写入EXCEL--------- book = xlsxwriter.Workbook('product_infor.xlsx') sheet = book.add_worksheet('Sheet1') sheet_fail = book.add_worksheet('Fail') # 设置sheet表单元格列宽 sheet.set_column("A:A", 21.88) # 设定A列列宽为21.88 sheet.set_column("B:B", 10.5) sheet.set_column("C:C", 19.38) sheet.set_column("D:D", 8.25) sheet.set_column("E:E", 10.75) # 设定整个sheet表的单元格的格式 property = { 'font_size': 11, # 字体大小 'bold': False, # 是否加粗 'align': 'center', # 水平对齐方式 left 'valign': 'vcenter', # 垂直对齐方式 'font_name': u'微软雅黑', 'text_wrap': False, # 是否自动换行 } cell_format = book.add_format(property) # 设置sheet表单元格行高 sheet.set_row(0, 22) # 设置第一行的高度为22 # 在向单元格中写入内容时，加上单元格样式 # 插入第一行 sheet.write(0, 0, 'Product Name', cell_format) sheet.write(0, 1, 'Show', cell_format) sheet.write(0, 2, 'Item No.', cell_format) sheet.write(0, 3, 'EUR RP', cell_format) sheet.write(0, 4, 'Offer Price', cell_format) img_format = {'x_offset':4, # 左右移动 'y_offset':0, 'x_scale':0.2, # 缩放比例 'y_scale':0.19} # 插入爬取信息 row_number = 1 for (product_name,rmb,item_id) in zip(product_name_list,rmb_list,item_list): sheet.set_row(row_number, 52) # 设置第row_number行的高度为52 sheet.write(row_number, 0, product_name, cell_format) sheet.insert_image(row_number,1, './'+item_id+'.jpg' ,img_format) sheet.write(row_number, 2, item_id, cell_format) sheet.write(row_number, 3, rmb2eur(rmb), cell_format) sheet.write(row_number, 4, rmb, cell_format) row_number = row_number + 1 # 插入失败信息 sheet_fail.set_column("A:A", 19.38) sheet_fail.write(0, 0, 'Item No.', cell_format) row_number = 1 for item_id in fail_list: sheet_fail.set_row(row_number, 26) sheet_fail.write(row_number, 0, item_id, cell_format) row_number = row_number + 1 book.close() print('写入EXCEL成功') </code></pre>

python

15、爬网站信息、保存在excel

页面列表