# Example: running XPath queries against an HTML fragment parsed with lxml.html.

import lxml.html

# Sample markup used by every query below. It is assembled from adjacent
# string literals so the resulting value is a single line whose pieces are
# separated by single spaces.
test_data = (
    ' <div> <ul>'
    ' <li class="item-0">'
    '<a href="link1.html" id="places_neighbours__row">9,596,960first item</a></li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-inactive"><a href="link3.html">third item</a></li>'
    ' <li class="item-1">'
    '<a href="link4.html" id="places_neighbours__row">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' <li class="good-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' <book> <title lang="aaengbb">111111</title>'
    ' <price id="places_neighbours__row">29.99</price> </book>'
    ' <book> <title lang="zh">222222</title> <price>39.95</price> </book>'
    ' <book> <title>33333</title> <price>40</price> </book>'
    ' </div>'
    ' <a> <book> <title>123</title> </book> </a> '
)

# XPath quick reference (translated from the original notes):
#
#   /    start from the root tag; every step must be a strict parent/child
#   //   start from the current tag; any matching descendant is selected
#   *    wildcard, selects everything
#
#   //div/book[1]/title
#       the title element of the first book under div
#   //div/book/title[@lang="zh"]
#       title elements that have a lang attribute whose value is "zh"
#   //div/book/title   //book/title   //title   //div//title
#       the original notes say these all give the same result because the
#       relative paths all end at title.
#       NOTE(review): the sample data also contains a <title> under
#       <a>/<book>, outside the first <div> -- confirm the results really
#       are identical for this input.
#   //book/title/@*
#       all attribute values of title
#   //book/title/text()
#       the text content of title, via the built-in text() function
#   //a[@href="link1.html" and @id="places_neighbours__row"]
#   //a[@href="link1.html" or @id="places_neighbours__row"]
#       combining attribute tests with "and" / "or"
#   //div/book[last()]/title/text()
#       the title text of the last book element
#   //div/book[price > 39]/title
#       books whose child price has a numeric value greater than 39
#   //li[starts-with(@class,'item')]
#       li tags whose class attribute starts with "item"
#   //title[contains(@lang,'eng')]
#       title tags whose lang attribute contains "eng"

html = lxml.html.fromstring(test_data)

# Other expressions to try: substitute any of them into the xpath() call below.
#
#   //div/book/title/text()
#   //div/book[1]/title/text()
#   //div/book/title[@lang="zh"]/text()
#   //book/title/text()
#   //title/text()
#   //div//title/text()
#   //book/title/@*
#   //a[@href="link1.html" and @id="places_neighbours__row"]/text()
#   //a[@href="link2.html"]/text()
#   //div/ul/li/a[@id]/text()
#   //a[@href="link1.html" and @id="places_neighbours__row"]/@*
#   //a[@href="link1.html" and @id="places_neighbours__row"]/@href
#   //a[@href="link1.html" or @id="places_neighbours__row"]/text()
#   //div/book[last()]/title/text()
#   //div/book[price > 39]/title/text()
#   //li[starts-with(@class,"item")]/a/text()

# Active query: text of every <title> whose lang attribute contains "eng"
# (in the sample data, lang="aaengbb" contains that substring).
html_data = html.xpath('//title[contains(@lang,"eng")]/text()')

# Print each selected value on its own line.
for i in html_data:
    print(i)