python


1、test(一)

<pre><code>import requests import urllib.request from lxml import html def saveHtml(file_name, file_content): # 注意windows文件命名的禁用符,比如 / with open(file_name.replace('/', '_') + ".html", "wb") as f: # 写文件用bytes而不是str,所以要转码 f.write(file_content) def getHtml(url): page = requests.Session().get(url) # tree = html.fromstring(page.text) tree = page.content #result = tree.xpath('//body//text()') print(page.content) return tree #.encode(encoding="utf-8")#decode('gbk').encode(encoding="utf-8") #.decode('gbk').encode(encoding="utf-8") url='http://rmrb.zhouenlai.info/●●●●/today/rmrbtoday.php/' #需要爬数据的网址 page=requests.Session().get(url) tree=html.fromstring(page.text) result=tree.xpath('//strong//text()') #获取需要的数据 _url='http://rmrb.zhouenlai.info/●●●●/today/' url_='.htm' print (len(result)) print (result[1]) saveHtml (result[1],getHtml(_url+ result[1] + url_)) #for url1 in result: # saveHtml (url1,getHtml(_url+ url1 + url_)) print("ok") ''' aurl = "http://www.view.sdu.edu.cn/info/1003/75240.htm" html = getHtml(aurl) saveHtml("sduview", html) ''' ''' import urllib.request def getHtml(url): html = urllib.request.urlopen(url).read() return html def saveHtml(file_name, file_content): # 注意windows文件命名的禁用符,比如 / with open(file_name.replace('/', '_') + ".html", "wb") as f: # 写文件用bytes而不是str,所以要转码 f.write(file_content) aurl = "http://www.view.sdu.edu.cn/info/1003/75240.htm" html = getHtml(aurl) saveHtml("sduview", html) print("下载成功") from bs4 import BeautifulSoup import requests url = 'https://cn.tripadvisor.com/Attractions-g60763-Activities-New_York_City_New_York.html' wb_date = requests.get(url) soup = BeautifulSoup(wb_date.text,'lxml') print(soup) titles = soup.select('div.listing_title &gt; a[target="_blank"]') imgs = soup.select('img[width="180"]') cates = soup.select('div.p13n_reasoning_v2') #titles = soup.select('#ATTR_ENTRY_ &gt; div.attraction_clarity_cell &gt; div &gt; div &gt; div.listing_info &gt; div.listing_title &gt; a') #print(titles,imgs,cates) for title,img,cate in zip(titles,imgs,cates): data = { 'title':title.get_text(), 'img':img.get('src'), 'cate':list(cate.stripped_strings), } print(data) ''' ''' &lt;a href="/Attraction_Review-g60763-d267031-Reviews-Manhattan_Skyline-New_York_City_New_York.html" onclick="ta.setEvtCookie('Attraction_List_Click', 'POI_click', 'name', 5, '/Attraction_Review')" target="_blank"&gt;曼哈顿天际线&lt;/a&gt; #ATTR_ENTRY_ &gt; div.attraction_clarity_cell &gt; div &gt; div &gt; div.listing_info &gt; div.listing_title &gt; a #ATTR_ENTRY_ &gt; div.attraction_clarity_cell #ATTR_ENTRY_ &gt; div.attraction_clarity_cell &gt; div &gt; div &gt; div.listing_info &gt; div.listing_title &gt; a #ATTR_ENTRY_ &gt; div.attraction_clarity_cell &gt; div &gt; div &gt; div.listing_info &gt; div.listing_title &gt; a #taplc_attraction_coverpage_attraction_0 &gt; div:nth-child(1) &gt; div &gt; div &gt; div.shelf_item_container &gt; div:nth-child(1) &gt; div.poi &gt; div &gt; div.item.name &gt; a '''</code></pre>

页面列表

ITEM_HTML