4、jd_with_image
<pre><code>
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import hashlib
import os
import pickle
import random
import xlsxwriter
import urllib.request
from multiprocessing import Pool
import socket
# Global 5-second socket timeout so hung image downloads fail fast
# instead of blocking a pool worker indefinitely.
socket.setdefaulttimeout(5)
def mkdir(path):
    """Create directory *path* (including any missing parents).

    Returns True if the directory was created, False if it already existed.
    """
    path = path.strip()
    path = path.rstrip("\\")
    try:
        # EAFP: create directly instead of exists()+makedirs(), which is a
        # TOCTOU race when several processes create the same path.
        os.makedirs(path)
        print(path + ' 创建成功')
        return True
    except FileExistsError:
        # Directory already exists: report it and decline to create.
        print(path + ' 目录已存在')
        return False
def function(date):
    """Sort key: extract the price field from a product record dict."""
    price = date['product_price']
    return price
def openUrl(url):
    """Fetch *url* with a desktop-browser User-Agent and parse the body.

    Returns a BeautifulSoup tree built with the 'lxml' parser.
    Raises urllib.error.URLError / socket.timeout on network failure
    (callers wrap this in try/except and skip the product).
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # Context manager closes the HTTP response even if read() raises;
    # the original leaked the connection and also print()ed the whole
    # raw page body (debug leftover, removed).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    Soup = BeautifulSoup(html, 'lxml')
    return Soup
# Product categories to scrape; each has a pickled listing file under ./jingdong/.
style_list = [
'男钱包', '女钱包', '男包','女包', '男鞋', '女鞋','男装', '女装', '男腰带','女腰带', '男围巾', '女围巾'
]
# Module-level counter (note: the functions below shadow this with a local of
# the same name, so this global is never actually incremented).
it_goods_i_id = 0
# Root directory for all scraped data and downloaded images.
work_path = './jingdong/'
def makdir_img_path():
    """Build product_info_list.txt for every category in style_list.

    Reads the raw scrape pickle ('./jingdong/<type>.txt'), assigns each
    product a sequential local image path ('img/<n>.jpg'), sorts the
    records by price and pickles the result to
    '<work_path><type>/product_info_list.txt'.
    """
    for download_type in style_list:
        product_info_list = []
        img_save_path = './jingdong/' + download_type + '/img/'
        mkdir(img_save_path)
        # Context manager closes the pickle file; the original leaked the handle.
        with open('./jingdong/' + download_type + '.txt', 'rb') as fh:
            jd_list = pickle.load(fh)
        # enumerate replaces the manual it_goods_i_id counter.
        for it_goods_i_id, product in enumerate(jd_list):
            product_img_url_path = img_save_path + str(it_goods_i_id) + '.jpg'
            product_info = {
                'product_img_url': product['product_img_url'],
                'product_url': product['product_url'],
                'product_title': product['product_title'],
                'product_plust_price': product['product_plust_price'],
                'product_activity_info': product['product_activity_info'],
                'product_commit': product['product_commit'],
                'product_shop_names': product['product_shop_names'],
                'product_img_url_path': product_img_url_path,
                'product_price': product['product_price']
            }
            print(product_img_url_path)
            print('******************')
            product_info_list.append(product_info)
        # Sort ascending by price before persisting.
        product_info_list.sort(key=function)
        with open(work_path + download_type + '/product_info_list.txt', 'wb') as fh:
            pickle.dump(product_info_list, fh)
def get_img_path():
    """Like makdir_img_path, but scrapes a missing image URL from the
    product detail page before building each record.

    Products whose image URL is empty get their page fetched via openUrl;
    the image is read from the 'jqzoom main-img' element's data-origin
    attribute. Products that cannot be fetched or parsed are skipped
    entirely (best-effort). Results are sorted by price and pickled.
    """
    for download_type in style_list:
        it_goods_i_id = 0
        product_info_list = []
        img_save_path = './jingdong/' + download_type + '/img/'
        mkdir(img_save_path)
        # Context manager closes the pickle file; the original leaked the handle.
        with open('./jingdong/' + download_type + '.txt', 'rb') as fh:
            jd_list = pickle.load(fh)
        for product in jd_list:
            print('start')
            product_url = product['product_url']
            product_title = product['product_title']
            product_img_url = product['product_img_url']
            if product_img_url == '':
                # Listing page gave no image: pull it from the detail page.
                try:
                    Soup = openUrl(product_url)
                except Exception as e:
                    print(e)
                    continue  # best-effort: drop products we cannot fetch
                try:
                    product_imgs = Soup.find_all(class_="jqzoom main-img")
                    img_url_1 = product_imgs[0].img['data-origin']
                except Exception as e:
                    print(e)
                    continue  # layout changed or image attribute missing
                print('************************************')
                print(img_url_1)
                print('************************************')
                # data-origin is protocol-relative; prepend the scheme.
                product_img_url = 'https:' + img_url_1
            product_img_url_path = img_save_path + str(it_goods_i_id) + '.jpg'
            # Manual counter (not enumerate): skipped products must not
            # consume an image index.
            it_goods_i_id = it_goods_i_id + 1
            product_info = {
                'product_img_url': product_img_url,
                'product_url': product_url,
                'product_title': product_title,
                'product_plust_price': product['product_plust_price'],
                'product_activity_info': product['product_activity_info'],
                'product_commit': product['product_commit'],
                'product_shop_names': product['product_shop_names'],
                'product_img_url_path': product_img_url_path,
                'product_price': product['product_price']
            }
            print(product_img_url_path)
            print('******************')
            product_info_list.append(product_info)
        # Sort ascending by price before persisting.
        product_info_list.sort(key=function)
        with open(work_path + download_type + '/product_info_list.txt', 'wb') as fh:
            pickle.dump(product_info_list, fh)
class DownImgClass:
    """Downloads the product images listed in each category's
    product_info_list.txt, using a multiprocessing pool per category."""

    def __init__(self):
        # Flat list of {'img_url': ..., 'img_path': ...} for the
        # category currently being downloaded.
        self.imgurl_all_list = []

    def downUrl(self, url, path_name):
        """Download *url* to *path_name*; raises on network failure."""
        urllib.request.urlretrieve(url, path_name)
        print(path_name + ": success")
        return None

    def start_download(self):
        """Download images for every category in style_list."""
        # Iterate values directly instead of range(len(...)) indices.
        for download_type in style_list:
            self.multi_download_img(download_type)

    def multi_download_img(self, download_type):
        """Collect (url, path) pairs for one category and fetch them in parallel."""
        self.imgurl_all_list = []
        # Context manager closes the pickle file; the original leaked the handle.
        with open(work_path + download_type + '/product_info_list.txt', 'rb') as fh:
            product_info_list = pickle.load(fh)
        print('read')
        for product in product_info_list:
            img_url = product['product_img_url']
            if img_url == '':
                continue  # nothing to download for this product
            img_url_dict = {'img_url': img_url,
                            'img_path': product['product_img_url_path']}
            self.imgurl_all_list.append(img_url_dict)
        print(len(self.imgurl_all_list))
        print('down2')
        # Workers receive a pickled copy of self, so imgurl_all_list must be
        # fully populated before the pool starts.
        pool = Pool()
        try:
            pool.map(self.second_multi_download_img, range(len(self.imgurl_all_list)))
        finally:
            # Always release worker processes, even if map() raises.
            pool.close()
            pool.join()
        print('down1')

    def second_multi_download_img(self, number):
        """Pool worker: download entry *number*; failures are logged, not fatal."""
        print(self.imgurl_all_list[number])
        img_url = self.imgurl_all_list[number]['img_url']
        url_path = self.imgurl_all_list[number]['img_path']
        print('down')
        try:
            self.downUrl(img_url, url_path)
        except Exception as e:
            print(e)
if __name__ == "__main__":
    # get_img_path()
    # makdir_img_path()
    a = time.time()
    # Info-gathering phase (currently disabled above); c marks its end.
    c = time.time()
    # Fix: the original computed a - c, printing a negated duration.
    d = c - a
    # Download the images.
    down_img = DownImgClass()
    down_img.start_download()
    b = time.time()
    # Report total runtime and the (currently ~zero) gather phase, in minutes.
    print((b - a) / 60)
    print(d / 60)
</code></pre>