1、test(一)
<pre><code>import requests
import urllib.request
from lxml import html
def saveHtml(file_name, file_content):
    """Write ``file_content`` to ``<file_name>.html`` in the current directory.

    Args:
        file_name: Base name of the output file, without extension. Every
            character that Windows forbids in file names
            (``\\ / : * ? " < > |``) is replaced with ``_``.
        file_content: Page body to store. ``bytes`` are written unchanged;
            ``str`` is encoded as UTF-8 first because the file is opened in
            binary mode.

    Raises:
        OSError: If the file cannot be created or written.
    """
    # Replace all Windows-reserved characters, not only "/", so that names
    # scraped from a page cannot produce an invalid path.
    forbidden = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    safe_name = file_name.translate(forbidden)

    # Binary mode requires bytes, so text is encoded before writing.
    if isinstance(file_content, str):
        file_content = file_content.encode("utf-8")

    with open(safe_name + ".html", "wb") as f:
        f.write(file_content)
def getHtml(url):
    """Download ``url`` with an HTTP GET and return the raw response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body (``Response.content``), suitable for
        writing straight to a file opened in binary mode.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # The context manager closes the session's pooled connections; the body
    # is fully read by get() (no streaming), so it stays usable afterwards.
    with requests.Session() as session:
        # Without a timeout, requests waits indefinitely on a stalled server.
        page = session.get(url, timeout=30)
    body = page.content
    # Echo the downloaded bytes to stdout for inspection.
    print(body)
    return body
#.encode(encoding="utf-8")#decode('gbk').encode(encoding="utf-8") #.decode('gbk').encode(encoding="utf-8")
# Index page that lists the entries to download.
url = 'http://rmrb.zhouenlai.info/●●●●/today/rmrbtoday.php/'
# Close the session once the index page has been fetched, and bound the wait
# so an unresponsive server cannot hang the script.
with requests.Session() as session:
    page = session.get(url, timeout=30)
tree = html.fromstring(page.text)
# Text of every <strong> element; each item is used as an entry name below.
result = tree.xpath('//strong//text()')

# An entry's address is built as _url + <entry name> + url_.
_url = 'http://rmrb.zhouenlai.info/●●●●/today/'
url_ = '.htm'

print(len(result))
# Only the second entry (index 1) is downloaded; guard against an index page
# with fewer than two entries instead of failing with IndexError.
if len(result) > 1:
    print(result[1])
    saveHtml(result[1], getHtml(_url + result[1] + url_))
    # To download every entry instead of just one:
    # for url1 in result:
    #     saveHtml(url1, getHtml(_url + url1 + url_))
    print("ok")
else:
    print("expected at least two <strong> entries on the index page, found", len(result))
'''
aurl = "http://www.view.sdu.edu.cn/info/1003/75240.htm"
html = getHtml(aurl)
saveHtml("sduview", html)
'''
'''
import urllib.request
def getHtml(url):
html = urllib.request.urlopen(url).read()
return html
def saveHtml(file_name, file_content):
# 注意windows文件命名的禁用符,比如 /
with open(file_name.replace('/', '_') + ".html", "wb") as f:
# 写文件用bytes而不是str,所以要转码
f.write(file_content)
aurl = "http://www.view.sdu.edu.cn/info/1003/75240.htm"
html = getHtml(aurl)
saveHtml("sduview", html)
print("下载成功")
from bs4 import BeautifulSoup
import requests
url = 'https://cn.tripadvisor.com/Attractions-g60763-Activities-New_York_City_New_York.html'
wb_date = requests.get(url)
soup = BeautifulSoup(wb_date.text,'lxml')
print(soup)
titles = soup.select('div.listing_title > a[target="_blank"]')
imgs = soup.select('img[width="180"]')
cates = soup.select('div.p13n_reasoning_v2')
#titles = soup.select('#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a')
#print(titles,imgs,cates)
for title,img,cate in zip(titles,imgs,cates):
data = {
'title':title.get_text(),
'img':img.get('src'),
'cate':list(cate.stripped_strings),
}
print(data)
'''
'''
<a href="/Attraction_Review-g60763-d267031-Reviews-Manhattan_Skyline-New_York_City_New_York.html" onclick="ta.setEvtCookie('Attraction_List_Click', 'POI_click', 'name', 5, '/Attraction_Review')" target="_blank">曼哈顿天际线</a>
#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a
#ATTR_ENTRY_ > div.attraction_clarity_cell
#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a
#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a
#taplc_attraction_coverpage_attraction_0 > div:nth-child(1) > div > div > div.shelf_item_container > div:nth-child(1) > div.poi > div > div.item.name > a
'''</code></pre>