import os
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue

import requests
from lxml import etree
# Exception handling is not yet in place; it will be added later.
# Open issue 1: this only crawls the images currently listed across the pages; images nested inside them are not handled yet.
# Open issue 2: crawling too many pages raises an error; the cause has not been found yet and will be addressed later.
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
    # Anti-hotlinking header: tells the server which page this request came from
    "Referer": "https://xxx"
}
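
# Overall flow: two processes communicate through a multiprocessing Queue.
# get_img_src (producer) scrapes image URLs and puts them on the queue;
# download_img (consumer) pulls them off and downloads them with a 5-thread pool.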
def get_img_src(q):
    # Build the list-page URLs (page 1 is index.html, the rest are numbered)
    urls = []
    for i in range(1, 5):
        if i == 1:
            a = "https://xxx/index.html"
        else:
            a = f"https://xxx/{i}.html"
        urls.append(a)
    # Collect every detail-page link from each list page
    href_list_all = []
    for i in urls:
        resp = requests.get(i, headers=headers)
        resp.encoding = 'utf-8'
        tree = etree.HTML(resp.text)
        href_list = tree.xpath("//div[@class='list-box-p']/ul/li/a/@href")
        href_list_all.append(href_list)
    for all_list in href_list_all:
        for href in all_list:
            child_resp = requests.get(href, headers=headers)
            child_resp.encoding = 'utf-8'
            child_tree = etree.HTML(child_resp.text)
            src = child_tree.xpath("//div[@class='img_box']/a/img/@src")[0]  # xpath returns a list; take the first element
            q.put(src)  # push each src into the queue so the downloader can pick it up
            print(f"---------------------------------------------------queued--------------------->{src}")
    q.put("DONE")  # sentinel telling the consumer there is nothing more to come
def download(src):
    print('start download ------------>', src)
    name = src.split('/')[-1]
    resp = requests.get(src, headers=headers)
    with open("./image/" + name, mode='wb') as f:
        f.write(resp.content)
    print('download finished ------------>', src)
def download_img(q):
    # Consumer: hand each queued src to a pool of 5 worker threads
    with ThreadPoolExecutor(5) as t:
        while 1:
            src = q.get()  # blocks until an item is available
            if src == "DONE":
                break
            t.submit(download, src)
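
# Design note: the "DONE" sentinel (a poison pill) is used instead of checking
# q.empty(), because a momentarily empty queue does not mean the producer is done.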
if __name__ == '__main__':
    os.makedirs("./image", exist_ok=True)  # make sure the output directory exists
    q = Queue()
    p1 = Process(target=get_img_src, args=(q,))
    p2 = Process(target=download_img, args=(q,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
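
# A minimal sketch of the exception handling flagged in the TODO above, assuming
# a retry loop with a timeout is enough; `safe_get` is a hypothetical helper, not
# part of the original script. The requests.get calls above could be routed
# through it once the error handling is fleshed out.
def safe_get(url, retries=3):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return resp
        except requests.RequestException as e:
            print(f"request failed ({attempt + 1}/{retries}): {url} -> {e}")
    return None  # caller decides how to handle a permanently failed URL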