32、downloadfile
<pre><code>
import urllib.request


def downUrl(url, path_name):
    """Download the resource at *url* and save it to *path_name*.

    Prints ``<path_name>: success`` once the file has been written.
    Returns None. Exceptions from ``urlretrieve`` (bad URL, network
    failure) propagate to the caller.
    """
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")
    return None
# Download lecture slides 24 and 25 of the Berkeley Deep RL course into
# the current directory.
for i in range(24, 26):
    # Build the URL once; the original repeated the literal in the print call.
    file_url = ("http://rail.eecs.berkeley.edu/deeprlcourse/static/slides/lec-"
                + str(i) + ".pdf")
    downUrl(file_url, './' + str(i) + ".pdf")
    print(file_url)
</code></pre>
<h2>提取文件名</h2>
<pre><code>from bs4 import BeautifulSoup
import urllib.request
# import pandas as pd
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys
def openUrl(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    A desktop-browser User-Agent header is sent so naive bot filtering
    does not reject the request. The response is decoded as UTF-8 and
    parsed with the ``lxml`` parser.
    """
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # issue the HTTP request
    html = response.read().decode("utf-8")
    Soup = BeautifulSoup(html, 'lxml')
    return Soup
# Scrape the Stanford NLP seminar page and collect absolute URLs of the
# slide PDFs (the second <a> inside each table row).
stf_soup = openUrl('https://nlp.stanford.edu/seminar/')
# NOTE(review): `content` is never used afterwards; kept for compatibility.
content = stf_soup.find_all(class_='container content')
pdf_list = []
for item in stf_soup.select('tr'):
    try:
        # Select once (the original evaluated the same expression twice).
        pdf_url = item.select('a')[1]['href']
        print(pdf_url)
        if 'http' in pdf_url:
            pdf_list.append(pdf_url)
        else:
            # Relative link: resolve it against the seminar page.
            pdf_list.append('https://nlp.stanford.edu/seminar/' + pdf_url)
    except Exception:
        # Rows without a second link are skipped — best-effort scrape.
        continue
import urllib.request  # already imported above; retained from the snippet


def downUrl(url, path_name):
    """Download the resource at *url* and save it to *path_name*.

    Prints ``<path_name>: success`` after writing; returns None.
    """
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")
    return None
# Save every collected PDF into the current directory, naming each file
# after the last path segment of its URL.
for item in pdf_list:
    downUrl(item, item.split('/')[-1])
</code></pre>