python


33、CSS选择器

<p><a href="http://liuyongqian.com/2018/03/23/beautifulsoup%E6%A8%A1%E5%9D%97/">http://liuyongqian.com/2018/03/23/beautifulsoup%E6%A8%A1%E5%9D%97/</a></p>

<pre><code>from bs4 import BeautifulSoup
import urllib.request
# import pandas as pd
import ssl
import time
import random
import xlsxwriter
import re
import json
import os
import pickle
import socket
import sys


def openUrl(url):
    """Fetch a page and return its BeautifulSoup parse tree.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A BeautifulSoup object built from the response body with the
        'lxml' parser.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Alternative User-Agent kept for reference:
    # headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    it_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15'}
    req = urllib.request.Request(url, headers=it_header)
    # Use the response as a context manager so the connection is always
    # closed, even when reading or decoding fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    # print(html)
    return BeautifulSoup(html, 'lxml')


soup = openUrl("http://www.phontron.com/class/nn4nlp2019/schedule/attention.html")

# First pass: collect every element whose class is "default".
default = soup.find_all(class_='default')

# Second pass: re-parse the first match and pull out its list items
# with a CSS selector.
Soup = BeautifulSoup(default[0].encode("utf-8"), 'lxml')

# In an interactive session the next expression displays the first list
# item; the string literal below it records that output.
Soup.select('li')[0]
'''
&lt;li&gt;&lt;i&gt;Required Reading (for quiz):&lt;/i&gt; &lt;a href="https://arxiv.org/pdf/1703.01619.pdf"&gt;Neural Machine Translation and Sequence-to-Sequence Models&lt;/a&gt; Chapter 8&lt;/li&gt;
'''

for item in Soup.select('li'):
    link = item.a
    # Some list items carry no anchor (or an anchor without href);
    # skip them instead of failing on item.a['href'].
    if link is not None and link.has_attr('href'):
        print(link['href'])
</code></pre>

页面列表

ITEM_HTML