安裝:pip install BeautifulSoup4 下表列出了主要的解析器,以及它們的優缺點:看個人習慣選取自己喜歡的解析方式 1 # 獲取html代碼 2 import requests 3 r = requests.get('http://www.python123.io/ws/demo ...
安裝:pip install BeautifulSoup4
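下面列出了主要的解析器,以及它們的優缺點,可依個人習慣選取自己喜歡的解析方式:
- Python 標準庫:BeautifulSoup(markup, "html.parser") —— 內建、無需額外安裝,速度適中,容錯能力強。
- lxml HTML 解析器:BeautifulSoup(markup, "lxml") —— 速度快,容錯能力強;需要額外安裝 lxml。
- lxml XML 解析器:BeautifulSoup(markup, "xml") —— 速度快,支持 XML;需要額外安裝 lxml。
- html5lib:BeautifulSoup(markup, "html5lib") —— 容錯性最好,以瀏覽器的方式解析文檔;速度慢,需要額外安裝 html5lib。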
# Fetch the HTML source of the demo page.
import requests
# A timeout keeps the script from hanging forever if the server never answers.
r = requests.get('http://www.python123.io/ws/demo.html', timeout=30)
demo = r.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(demo, 'html.parser')
# prettify() re-serialises the parsed tree with one tag per line and
# standard indentation.  Output:
print(soup.prettify())
# <html>
#  <head>
#   <title>
#    This is a python demo page
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     The demo python introduces several python courses.
#    </b>
#   </p>
#   <p class="course">
#    Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
#    <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
#     Basic Python
#    </a>
#    and
#    <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
#     Advanced Python
#    </a>
#    .
#   </p>
#  </body>
# </html>
簡單瀏覽數據化方法的用法
# Source of the demo page, embedded so the examples run without network access.
html_d="""
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_d,'html.parser')
# Get the <title> tag.
print(soup.title)
# Get the text content of the whole document.
print(soup.text)
# Get the name of the tag.
print(soup.title.name)
# Get the attributes of the tag.
print(soup.title.attrs)
# Get the child nodes of the first <p> tag (soup.p), first via .contents and
# then via .children; the latter prints the iterator object itself.
print(soup.p.contents)
print(soup.p.children)
# Get all <a> tags.
print(soup.find_all('a'))
常用解析方法
# Source of the demo page, embedded so the examples run without network access.
html_d = """
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""
from bs4 import BeautifulSoup

# The "lxml" parser requires the third-party lxml package to be installed.
soup = BeautifulSoup(html_d, "lxml")

# .contents: the direct children of the first <p> tag.
print(soup.p.contents)
# Name of the first top-level node of the document (printed so the value is
# actually shown; a bare expression has no effect in a script).
print(soup.contents[0].name)

# .children: an iterator over the same direct children of the first <p>.
print(soup.p.children)
for indexed_child in enumerate(soup.p.children):
    print(indexed_child)

# .descendants: every node nested below the first <p>, at any depth.
print(soup.p.descendants)
for indexed_node in enumerate(soup.p.descendants):
    print(indexed_node)

# .string: available when the tag has exactly one child; it is None when the
# tag has several children.
print(soup.title.string)

# .strings: every string in the document, whitespace included.
for string in soup.strings:
    print(repr(string))

# .stripped_strings: the same strings with surrounding whitespace removed.
for line in soup.stripped_strings:
    print(line)

# Direct parent of the first <a> tag.
print(soup.a.parent)

# All ancestors of the first <a> tag; iterate, because .parents is a generator
# and printing it directly only shows the generator object.
for parent in soup.a.parents:
    print(parent.name)

# Sibling nodes of the first <a> tag.
print(soup.a.previous_sibling)       # the node immediately before it
print(soup.a.next_sibling)           # the node immediately after it
print(list(soup.a.next_siblings))    # every node that follows it
find_all 的用法:find_all(name, attrs, recursive, string, **kwargs)(舊版本中 string 參數名為 text)
import re
from bs4 import BeautifulSoup

# NOTE: html_d is the demo page source defined in the previous example.
soup = BeautifulSoup(html_d, "lxml")

# name: a compiled pattern is matched against tag names, so this selects
# every tag whose name contains the letter "b".
for tag in soup.find_all(re.compile('b')):
    print(tag.name)

# attrs: a string in the second position filters by CSS class.
print(soup.find_all('p', 'course'))

# keyword: any keyword argument filters on the attribute of that name.
print(soup.find_all(id='link1'))

# recursive: with recursive=False only direct children are searched.
# print(soup.find_all('a',recursive=False))

# string: search text nodes instead of tags.
# print(soup.find_all(string=re.compile('python')))
小案例
import requests
from bs4 import BeautifulSoup
import bs4
def getHtmlText(url):
    """Download *url* and return the response body as text.

    The response encoding is replaced by the encoding detected from the
    content before decoding.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers receive "" rather than an exception.  Only
        # request failures are caught so that KeyboardInterrupt and genuine
        # programming errors still surface.
        return ""
def fillunivList(ulist, html):
    """Extract table rows from *html* and append them to *ulist*.

    Each tag found directly under the first <tbody> contributes one entry:
    a list holding the ``.string`` of its first four <td> cells.

    Args:
        ulist: List that receives the extracted rows; modified in place.
        html: HTML source of the page to parse.

    Returns:
        None.  Nothing is appended when the page has no <tbody>.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page (e.g. the download failed): nothing to add.
        return
    for tr in tbody.children:
        # .children also yields the whitespace strings between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        if len(tds) < 4:
            # Skip rows that do not carry all four expected cells.
            continue
        ulist.append([td.string for td in tds[:4]])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Every row is printed as four centred, tab-separated columns.  Cells are
    padded with ordinary spaces.

    Args:
        ulist: Sequence of rows; each row provides at least four cells.
        num: Maximum number of rows to print.  Fewer are printed when
            *ulist* is shorter; zero or a negative value prints only the
            header.

    Returns:
        None.
    """
    row_format = "{:^10}\t{:^6}\t{:^10}\t{:^10}"
    print(row_format.format('排名', '學校名稱', '地區', '總分'))
    # Slicing (instead of indexing range(num)) tolerates a list that holds
    # fewer than num rows; max() keeps a negative num from selecting rows.
    for u in ulist[:max(num, 0)]:
        # A cell may be None (a tag's .string is None when it has several
        # children); None does not accept an alignment format spec, so it is
        # rendered as an empty cell.
        cells = ["" if cell is None else str(cell) for cell in u[:4]]
        print(row_format.format(*cells))
def main():
    """Download the ranking page, extract its rows, and print the first 20."""
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    rankings = []
    page_source = getHtmlText(url)
    fillunivList(rankings, page_source)
    printUnivList(rankings, 20)  # print the first 20 universities


main()