# Commented-out alternate version of the Dangdang scraper (not executed); the live implementation starts at the imports further down.
# import requests,re,json
# # 定義一個函數用來請求噹噹網的網頁信息
# def request_dangdang(url):
# try:
# # 使用get請求
# response = requests.get(url)
# # 判斷返回的狀態碼是否為200
# if response.status_code == 200:
# # 返回接受的文本
# return response.text
# # 若訪問出現錯誤,就返回空
# except requests.RequestException:
# return None
# # 定義函數用來解析訪問噹噹網收到的文本文件
# def parse_text(html):
# # 將正則表達式匹配符封裝,以便多次使用
# pattern = re.compile(
# # 用正則表達式解析出網頁上我們需要的書本名稱信息
# '<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>',
# re.S
# )
# # 找出所有的書本信息
# results = re.findall(pattern,html)
# for result in results:
# # 用生成器的方式生成數據
# yield{
# "range":results[0],
# "image":results[1],
# "title":results[2],
# "recommend":results[3],
# "author":results[4],
# "times":results[5],
# "price":results[6]
# }
# # 將解析到的數據寫入文件中
# def write_to_file(result):
# print("準備開始,寫入數據 ====>" + str(result))
# with open("book.txt",'a',encoding = "utf-8") as f:
# f.write(json.dumps(result,ensure_ascii=False) + '\n')
# # 主函數
# def main(page):
# url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
# # 獲取噹噹網的頁面
# html = request_dangdang(url)
# print("獲取網頁成功")
# # 解析出我們想要的信息
# results = parse_text(html)
# print("解析信息成功")
# # 然後將信息依次寫入
# for result in results:
# write_to_file(result)
# print("寫入信息成功")
#
# if __name__ == "__main__":
# for index in range(1,26):
# print(index)
# main(index)
import requests
import re
import json
def request_dandan(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status is anything else or when the request raises
        ``requests.RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Non-200 responses are treated the same as a failed request.
    return None
def parse_result(html):
    """Yield one dict per ``<li>`` book entry matched in *html*.

    The regular expression captures seven groups per entry, which are
    emitted under the keys ``range``, ``iamge``, ``title``, ``recommend``,
    ``author``, ``times`` and ``price`` (in capture order).

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing instead of raising inside the regex engine.

    Yields:
        dict mapping the seven keys above to the captured strings.
    """
    print("正則表達式")
    if not html:
        # A failed download is represented as None; there is nothing to parse.
        return
    # Raw string pieces so that ``\d`` is not treated as an (invalid) string
    # escape. The pieces concatenate to a single pattern; re.S lets ``.``
    # span the newlines between tags.
    pattern = re.compile(
        r'<li>.*?list_num.*?(\d+).</div>'
        r'.*?<img src="(.*?)"'
        r'.*?class="name".*?title="(.*?)">'
        r'.*?class="star">.*?class="tuijian">(.*?)</span>'
        r'.*?class="publisher_info">.*?target="_blank">(.*?)</a>'
        r'.*?class="biaosheng">.*?<span>(.*?)</span></div>'
        r'.*?<p><span class="price_n">.yen;(.*?)</span>'
        r'.*?</li>',
        re.S,
    )
    for item in pattern.findall(html):
        yield {
            'range': item[0],
            # NOTE(review): 'iamge' looks like a typo for 'image'; kept
            # unchanged because it is the key emitted in existing records.
            'iamge': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6],
        }
def write_item_to_file(item):
    """Echo *item* to stdout and append it to ``book.txt`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    banner = '開始寫入數據 ====> '
    print(banner + str(item))
    with open('book.txt', 'a', encoding='UTF-8') as out_file:
        serialized = json.dumps(item, ensure_ascii=False)
        out_file.write(serialized + '\n')
def main(page):
    """Download one ranking page, parse it, and store every parsed record.

    Args:
        page: Page number appended to the ranking URL.

    The page is skipped (with a message on stdout) when the download
    returns ``None``, so that one failed request does not abort the crawl.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # request_dandan signals a network error or non-200 status with None;
        # handing that to the regex parser would raise TypeError.
        print('Skipping page ' + str(page) + ': request failed')
        return
    # Extract the fields of interest and persist each record.
    for item in parse_result(html):
        write_item_to_file(item)
if __name__ == "__main__":
    # Crawl pages 1 through 25; range() excludes its upper bound.
    first_page, stop_page = 1, 26
    for page_number in range(first_page, stop_page):
        main(page_number)