

32. Download files

<pre><code>import urllib.request

def downUrl(url, path_name):
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")

# Download lecture slides 24 and 25 from the Berkeley deep RL course page
for i in range(24, 26):
    file_url = "http://rail.eecs.berkeley.edu/deeprlcourse/static/slides/lec-" + str(i) + ".pdf"
    downUrl(file_url, './' + str(i) + ".pdf")
    print(file_url)
</code></pre>

<h2>Extract the file names</h2>

<pre><code>from bs4 import BeautifulSoup
import urllib.request

# Return the page parsed into a BeautifulSoup object
def openUrl(url):
    it_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    response = urllib.request.urlopen(req)  # send the request
    html = response.read().decode("utf-8")
    return BeautifulSoup(html, 'lxml')

stf_soup = openUrl('https://nlp.stanford.edu/seminar/')

# Collect the PDF links: the second anchor in each table row points at the slides
pdf_list = []
for item in stf_soup.select('tr'):
    try:
        pdf_url = item.select('a')[1]['href']
        print(pdf_url)
        if 'http' in pdf_url:
            pdf_list.append(pdf_url)
        else:
            pdf_list.append('https://nlp.stanford.edu/seminar/' + pdf_url)
    except Exception:
        continue

def downUrl(url, path_name):
    urllib.request.urlretrieve(url, path_name)
    print(path_name + ": success")

for item in pdf_list:
    downUrl(item, item.split('/')[-1])
</code></pre>
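Note that urlretrieve raises on any network or HTTP error and silently overwrites existing files, so for a longer download list a slightly more defensive helper is useful. Below is a minimal sketch; the helper name down_url_safe and the retry/delay parameters are illustrative choices, not part of the original snippets.

<pre><code>import os
import time
import urllib.error
import urllib.request

def down_url_safe(url, path_name, retries=3, delay=2):
    """Download url to path_name, skipping existing files and retrying on failure."""
    if os.path.exists(path_name):
        print(path_name + ": already exists, skipped")
        return
    for attempt in range(1, retries + 1):
        try:
            urllib.request.urlretrieve(url, path_name)
            print(path_name + ": success")
            return
        except (urllib.error.URLError, OSError) as e:
            print("attempt %d failed for %s: %s" % (attempt, url, e))
            time.sleep(delay)
    print(url + ": giving up")

# Usage with the pdf_list built above:
# for item in pdf_list:
#     down_url_safe(item, item.split('/')[-1])
</code></pre>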

Page list
