35. Downloading PDFs by extracting li tags
<pre><code>
# coding: utf-8
# In[1]:
from bs4 import BeautifulSoup
import urllib.request
import pickle
# In[2]:
# Fetch a URL and return its BeautifulSoup parse
def openUrl(url):
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    return soup
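# In[ ]:
# A sketch (not part of the original notebook): the same fetch hardened with
# a socket timeout and a couple of polite retries. The name openUrlRetry and
# the retry/timeout defaults are our assumptions.
import socket
import time
import random
import urllib.error

def openUrlRetry(url, retries=3, timeout=10):
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    for attempt in range(retries):
        try:
            response = urllib.request.urlopen(req, timeout=timeout)
            return BeautifulSoup(response.read().decode("utf-8"), 'lxml')
        except (urllib.error.URLError, socket.timeout) as e:
            print(url, "attempt", attempt + 1, "failed:", e)
            time.sleep(1 + random.random())  # brief randomized backoff
    return None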
# In[3]:
soup = openUrl("http://www.phontron.com/class/nn4nlp2019/schedule/attention.html")
# In[4]:
default = soup.find_all(class_='default')
# In[12]:
# Re-parse the first 'default' block so we can select its <li> entries
li_soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')
# In[31]:
li_soup.select('li')[0].text
# In[32]:
li_soup.select('li')[1].text
# In[3]:
# Simple container pairing a lecture PDF's display name with its URL
class PdfName:
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def get_name(self):
        return self.name

    def get_url(self):
        return self.url
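# In[ ]:
# Aside (our sketch, not in the original): a dataclass gives the same
# name/url container with less boilerplate. PdfEntry is a hypothetical
# alternative name, not used elsewhere in this notebook.
from dataclasses import dataclass

@dataclass
class PdfEntry:
    name: str
    url: str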
# In[4]:
nn4nlp2019_url = 'http://www.phontron.com/class/nn4nlp2019/schedule.html'
nn4nlp_soup = openUrl(nn4nlp2019_url)
post_link = nn4nlp_soup.find_all(class_='post-link')
# In[10]:
# Interleave each lecture title with its absolute URL. The title strings
# (which contain no 'http') act as section markers in the loop below.
post_link_list = []
for url in post_link:
    post_link_list.append(url.text)
    post_link_list.append('http://www.phontron.com' + url['href'])
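# In[ ]:
# A sketch of the same join using urllib.parse.urljoin, which also copes
# with hrefs that are already absolute (assumption: each href is either a
# site-relative path or a full URL).
from urllib.parse import urljoin
abs_links = [urljoin('http://www.phontron.com/', a['href']) for a in post_link]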
# In[13]:
# Walk every lecture page; keep plain title strings as-is, and collect a
# PdfName for each <li> that links to a reading.
pdf_object_list = []
for item_url in post_link_list:
    if 'http' not in item_url:
        # a title marker, not a URL: keep it as a plain string
        pdf_object_list.append(item_url)
        continue
    item_soup = openUrl(item_url)
    default = item_soup.find_all(class_='default')
    try:
        li_soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')
    except Exception as e:
        print(e)
        continue
    for pdf in li_soup.select('li'):
        try:
            pdf_url = pdf.a['href']
            pdf_name = pdf.text
        except Exception as e:
            # <li> without a link: note it and skip
            pdf_url = ""
            pdf_name = ""
            print(pdf)
        if pdf_url == "" and pdf_name == "":
            continue
        pdf_object_list.append(PdfName(pdf_name, pdf_url))
    print(item_url)
# In[7]:
len(pdf_object_list)
# In[21]:
# Persist the scraped list so the download step can be rerun separately
pickle.dump(pdf_object_list, open('./pdf_object_list_3.txt', 'wb'))
# In[ ]:
# Reload a previously pickled list and print it for inspection
pdf_object_list = pickle.load(open('./pdf_object_list_2.txt', 'rb'))
for item in pdf_object_list:
    if isinstance(item, str):
        # title marker
        print("******************************************************")
        print(item)
        print("******************************************************")
        continue
    print(item.get_name())
    print(item.get_url())
# In[22]:
def downUrl(url, path_name):
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")
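# In[ ]:
# A guarded variant (our sketch; downUrlSafe is a hypothetical name): skip
# files already on disk and survive individual failed downloads instead of
# aborting the whole loop.
import os
import urllib.error

def downUrlSafe(url, path_name):
    if os.path.exists(path_name):
        print(path_name + ": already exists, skipping")
        return
    try:
        urllib.request.urlretrieve(url, path_name)
        print(path_name + ": success")
    except urllib.error.URLError as e:
        print(path_name + ": failed -", e)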
# In[35]:
# `!set ...` only affects a throwaway subshell, so set the proxy in this
# kernel's environment instead; urllib picks these variables up automatically
import os
os.environ['http_proxy'] = 'http://127.0.0.1:1080'
os.environ['https_proxy'] = 'http://127.0.0.1:1080'
# In[36]:
# Download every item whose URL contains '.pdf', naming the local file after
# the last path segment of the URL
for item in pdf_object_list:
    if isinstance(item, str):
        continue
    if '.pdf' in item.get_url():
        url = item.get_url()
        downUrl(url, url.split('/')[-1])
        print(url.split('/')[-1])
# In[23]:
url0 = 'https://arxiv.org/pdf/1608.05859.pdf'
# In[28]:
url0.split('/')[-1:]  # a one-element list: ['1608.05859.pdf']
# In[29]:
url0.split('/')[-1]   # the bare filename string: '1608.05859.pdf'
</code></pre>