33、CSS选择器
<p><a href="http://liuyongqian.com/2018/03/23/beautifulsoup%E6%A8%A1%E5%9D%97/">http://liuyongqian.com/2018/03/23/beautifulsoup%E6%A8%A1%E5%9D%97/</a></p>
<pre><code>
from bs4 import BeautifulSoup
import urllib.request
# import pandas as pd
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys
def openUrl(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built from the page text with the lxml parser.

    Raises:
        UnicodeDecodeError: If the body cannot be decoded with the charset
            chosen below.
        Any exception raised by ``urllib.request.urlopen`` propagates
        unchanged.
    """
    # A browser-like User-Agent is sent with the request.
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # The context manager closes the connection even when read() fails,
    # so repeated calls do not leak sockets.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        # Honour the charset from the Content-Type header when the server
        # declares one; fall back to UTF-8 otherwise.
        charset = response.headers.get_content_charset() or "utf-8"
    html = raw.decode(charset)
    return BeautifulSoup(html, 'lxml')
soup = openUrl("http://www.phontron.com/class/nn4nlp2019/schedule/attention.html")
# First pass: collect every element carrying the CSS class "default".
default = soup.find_all(class_='default')
if not default:
    # Same exception type as indexing an empty result, but with a message
    # that says what was actually missing.
    raise IndexError("no element with class 'default' found on the page")
# Second pass: re-parse the first match on its own, then pull out its <li>
# items with a CSS selector.
Soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')
# Sample of a single selected item, i.e. Soup.select('li')[0]:
#   <li><i>Required Reading (for quiz):</i> <a href="https://arxiv.org/pdf/1703.01619.pdf">Neural Machine Translation and Sequence-to-Sequence Models</a> Chapter 8</li>
for item in Soup.select('li'):
    link = item.a  # first <a> inside the item, or None when it has no link
    if link is None:
        continue
    href = link.get('href')  # None when the <a> carries no href attribute
    if href is not None:
        print(href)
</code></pre>