

35. Downloading PDFs: extracting li tags

<pre><code># coding: utf-8

# In[1]:

from bs4 import BeautifulSoup
import urllib.request
# import pandas as pd
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys


# In[2]:

# Return a BeautifulSoup parse of the given URL's HTML
def openUrl(url):
    # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read().decode("utf-8")
    # print(html)
    Soup = BeautifulSoup(html, 'lxml')
    return Soup


# In[3]:

soup = openUrl("http://www.phontron.com/class/nn4nlp2019/schedule/attention.html")


# In[4]:

default = soup.find_all(class_='default')


# In[12]:

Soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')


# In[31]:

Soup.select('li')[0].text


# In[32]:

Soup.select('li')[1].text


# In[3]:

# Small holder for a PDF's display name and its URL
class PdfName:
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def get_name(self):
        return self.name

    def get_url(self):
        return self.url


# In[10]:

# NOTE: this cell depends on post_link, which is defined in the In[4] cell
# below -- the notebook cells were executed out of order.
post_link_list = []
for url in post_link:
    post_link_list.append(url.text)
    post_link_list.append('http://www.phontron.com' + url['href'])


# In[4]:

nn4nlp2019_url = 'http://www.phontron.com/class/nn4nlp2019/schedule.html'
nn4nlp_soup = openUrl(nn4nlp2019_url)
post_link = nn4nlp_soup.find_all(class_='post-link')
post_link_list = []
for url in post_link:
    post_link_list.append('http://www.phontron.com' + url['href'])


# In[13]:

# Visit every lecture page, pull the li entries out of the 'default' block,
# and collect (name, url) pairs as PdfName objects.
pdf_object_list = []
for item_url in post_link_list:
    if 'http' not in item_url:
        pdf_object_list.append(item_url)
        continue
    item_soup = openUrl(item_url)
    default = item_soup.find_all(class_='default')
    try:
        li_soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')
    except Exception as e:
        print(e)
        continue
    for pdf in li_soup.select('li'):
        # print(pdf)
        try:
            pdf_url = pdf.a['href']
            pdf_name = pdf.text
        except Exception as e:
            pdf_url = ""
            pdf_name = ""
            print(pdf)
        if pdf_url == "" and pdf_name == "":
            continue
        pdf_object_list.append(PdfName(pdf_name, pdf_url))
    print(item_url)


# In[7]:

len(pdf_object_list)


# In[21]:

import pickle
pickle.dump(pdf_object_list, open('./pdf_object_list_3.txt', 'wb'))


# In[ ]:

import pickle
# Loads the list saved in an earlier run (note the different filename).
pdf_object_list = pickle.load(open('./pdf_object_list_2.txt', 'rb'))
for item in pdf_object_list:
    if isinstance(item, str):
        print("******************************************************")
        print(item)
        print("******************************************************")
        continue
    print(item.get_name())
    print(item.get_url())


# In[19]:

for item in pdf_object_list:
    if isinstance(item, str):
        print("******************************************************")
        print(item)
        print("******************************************************")
        continue
    print(item.get_name())
    print(item.get_url())


# In[22]:

import urllib.request

def downUrl(url, path_name):
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")
    return None


# In[35]:

# NOTE: `set` here runs in a throwaway subshell, so these proxy variables
# never reach this Python process; see the sketch after this block.
get_ipython().system('set http_proxy=http://127.0.0.1:1080')
get_ipython().system('set https_proxy=http://127.0.0.1:1080')


# In[36]:

for item in pdf_object_list:
    if isinstance(item, str):
        continue
    if '.pdf' in item.get_url():
        item = item.get_url()
        downUrl(item, item.split('/')[-1])
        print(item.split('/')[-1])
# if isinstance(item,str):
#     print("******************************************************")
#     print(item)
#     print("******************************************************")
#     continue
# print(item.get_name())
# print(item.get_url())


# In[23]:

url0 = 'https://arxiv.org/pdf/1608.05859.pdf'


# In[28]:

url0.split('/')[-1:]


# In[29]:

url0.split('/')[-1]
</code></pre>

Page list
