將同一個女優的放到一個文件夾,用 threading.Lock() 防止新建文件夾錯誤,但註釋掉後還能正常運行,有待觀察 ...
爬取後發現有一些女優並沒有新建文件夾存放,居然有些title屬性為空。
最主要的是沒有使用代理,IP被封了。。。。。。。。。
將同一個女優的放到一個文件夾,用 threading.Lock() 防止新建文件夾錯誤,但註釋掉後還能正常運行,有待觀察。
import os
import re
import threading
from queue import Empty, Queue
from threading import Lock
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree
# Site root; relative links scraped from pages are resolved against it.
base = 'http://nanrenvip.xyz'

# HTTP headers sent with every page request made through ``requests``.
headers = {
    # Placeholder value; put a real browser User-Agent string here.
    'User-Agent': '~~~~~~~~~~~~~~~',
    'Referer': 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-0.html',
}
class Producer(threading.Thread):
    """Stage 1 worker: expand listing pages into per-performer records.

    Every URL taken from ``pages`` is fetched and parsed. For each list entry
    a ``{'name': ..., 'url': ...}`` dict is put on ``women_pages``. The name
    travels through the later stages and is finally used as the folder name
    for that performer's images.
    """

    # Seconds passed to ``requests.get`` as its timeout so a stalled
    # connection cannot hang the worker forever.
    request_timeout = 15

    def __init__(self, pages, women_pages, *args, **kwargs):
        """Store the input queue (``pages``) and output queue (``women_pages``)."""
        super(Producer, self).__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages

    def run(self):
        """Process listing URLs until ``pages`` is drained."""
        while True:
            # A non-blocking get avoids the empty()-then-get() race: with
            # several workers, another thread may take the last URL between
            # the check and a blocking get(), leaving this thread stuck.
            try:
                url = self.pages.get_nowait()
            except Empty:
                break
            # One bad page must not kill the worker; report it and move on.
            try:
                self.get_women(url)
            except Exception as exc:
                print('failed to process listing page %s: %r' % (url, exc))

    @staticmethod
    def _pick_name(title, link_text, href):
        """Return a non-empty folder name for one list entry.

        Preference order: the anchor's ``title`` attribute, the anchor's
        visible text, the last path segment of ``href`` without its
        extension, and finally the literal ``'unknown'``.
        """
        for candidate in (title, link_text):
            candidate = (candidate or '').strip()
            if candidate:
                return candidate
        slug = (href or '').rstrip('/').rsplit('/', 1)[-1]
        return slug.rsplit('.', 1)[0] or slug or 'unknown'

    def get_women(self, url):
        """Fetch one listing page and queue a record for each entry on it.

        Only the first 15 ``<li>`` entries of the list are used. Entries
        without a link are skipped; entries whose ``title`` attribute is
        missing or empty get a fallback name (see ``_pick_name``).
        """
        response = requests.get(url, headers=headers,
                                timeout=self.request_timeout)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        box = html.xpath(
            '//div[@class="list_box"]//div[@class="list_l"]//li')[:15]
        for each in box:
            hrefs = each.xpath('./a/@href')
            if not hrefs:
                continue
            her_url = hrefs[0]
            titles = each.xpath('./a/@title')
            name = self._pick_name(titles[0] if titles else '',
                                   ''.join(each.xpath('./a//text()')),
                                   her_url)
            # urljoin equals ``base + her_url`` for root-relative links and
            # also copes with absolute or slash-less hrefs.
            women = {'name': name, 'url': urljoin(base, her_url)}
            self.women_pages.put(women)
class Producer_2(threading.Thread):
    """Stage 2 worker: expand a performer page into its list of works.

    Takes ``{'name', 'url'}`` records from ``women_pages``, fetches the
    detail page and puts one ``{'url', 'title', 'name'}`` dict per linked
    work on ``avs``. The title is later used as the image file name.
    """

    # Seconds passed to ``requests.get`` as its timeout.
    request_timeout = 15
    # Idle period: the worker exits only after its input queue stayed empty
    # this long while the upstream queue is empty too. This narrows, but
    # does not fully close, the window in which an upstream worker is still
    # fetching its last item.
    poll_timeout = 10

    # Anchor with an href anywhere among its attributes; group 2 is the
    # anchor body, which may contain nested markup.
    _LINK_RE = re.compile(r'<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)</a>',
                          re.DOTALL)
    _TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self, pages, women_pages, avs, *args, **kwargs):
        """Store the upstream queues and the output queue ``avs``."""
        super(Producer_2, self).__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs

    def run(self):
        """Process performer records until the pipeline upstream is drained."""
        while True:
            # A timed get replaces empty()-then-get(): if another worker
            # takes the last record first, this thread re-checks the exit
            # condition instead of blocking forever.
            try:
                women = self.women_pages.get(timeout=self.poll_timeout)
            except Empty:
                if self.pages.empty():
                    break
                continue
            # One bad page must not kill the worker; report it and move on.
            try:
                self.get_av_list(women)
            except Exception as exc:
                print('failed to process performer page %r: %r' % (women, exc))

    @classmethod
    def _extract_links(cls, fragment):
        """Return ``[(href, title), ...]`` for every ``<a href>`` in ``fragment``.

        Nested tags are removed from the title and whitespace is collapsed.
        Anchors with an empty href are skipped. An empty title falls back to
        the last path segment of the href without its extension.
        """
        links = []
        for href, raw_title in cls._LINK_RE.findall(fragment):
            if not href:
                continue
            title = ' '.join(cls._TAG_RE.sub('', raw_title).split())
            if not title:
                slug = href.rstrip('/').rsplit('/', 1)[-1]
                title = slug.rsplit('.', 1)[0] or slug
            links.append((href, title))
        return links

    def get_av_list(self, women):
        """Fetch one performer page and queue a record for each linked work."""
        url = women['url']
        name = women['name']
        response = requests.get(url, headers=headers,
                                timeout=self.request_timeout)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        containers = (html.xpath('//div[@class="zp_list"]')
                      if html is not None else [])
        if not containers:
            # Page layout differs or the request was answered with an error
            # page; nothing to enqueue.
            print('no works list found on %s' % url)
            return
        fragment = etree.tostring(containers[0],
                                  encoding='utf-8').decode('utf-8')
        for href, title in self._extract_links(fragment):
            av_list = {'url': urljoin(base, href), 'title': title,
                       'name': name}
            self.avs.put(av_list)
class Producer_3(threading.Thread):
    """Stage 3 worker: resolve a work page to its image URL.

    Takes ``{'url', 'title', 'name'}`` records from ``avs``, fetches the
    page and puts ``{'name', 'title', 'src'}`` on ``imgs`` for the first
    image found in the article body.
    """

    # Seconds passed to ``requests.get`` as its timeout.
    request_timeout = 15
    # Idle period: the worker exits only after its input queue stayed empty
    # this long while every upstream queue is empty too. This narrows, but
    # does not fully close, the window in which an upstream worker is still
    # fetching its last item.
    poll_timeout = 10

    _SRC_RE = re.compile(r'data-original="(.*?)"', re.DOTALL)

    def __init__(self, pages, women_pages, avs, imgs, *args, **kwargs):
        """Store the upstream queues, the input ``avs`` and the output ``imgs``."""
        super(Producer_3, self).__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs
        self.imgs = imgs

    def run(self):
        """Process work records until the pipeline upstream is drained."""
        while True:
            # A timed get replaces empty()-then-get(): if another worker
            # takes the last record first, this thread re-checks the exit
            # condition instead of blocking forever.
            try:
                av = self.avs.get(timeout=self.poll_timeout)
            except Empty:
                if self.pages.empty() and self.women_pages.empty():
                    break
                continue
            # One bad page must not kill the worker; report it and move on.
            try:
                self.get_imgs(av)
            except Exception as exc:
                print('failed to process work page %r: %r' % (av, exc))

    @classmethod
    def _first_image_src(cls, fragment):
        """Return the first non-empty ``data-original`` value, or ``None``."""
        match = cls._SRC_RE.search(fragment)
        if match is None:
            return None
        return match.group(1) or None

    def get_imgs(self, av):
        """Fetch one work page and queue its image URL with name and title."""
        url = av['url']
        name = av['name']
        title = av['title']
        response = requests.get(url, headers=headers,
                                timeout=self.request_timeout)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        containers = (html.xpath('//div[@class="artCon"]')
                      if html is not None else [])
        if not containers:
            print('no article body found on %s' % url)
            return
        fragment = etree.tostring(containers[0],
                                  encoding='utf-8').decode('utf-8')
        src = self._first_image_src(fragment)
        if src is None:
            print('no image found on %s' % url)
            return
        # urljoin equals ``base + src`` for root-relative paths and leaves
        # already-absolute image URLs intact.
        img = {'name': name, 'title': title, 'src': urljoin(base, src)}
        self.imgs.put(img)
class Consumer(threading.Thread):
    """Final stage: download each queued image into ``vip/<name>/<title>.jpg``.

    ``lock`` serialises creation of the per-performer folder across
    consumer threads.
    """

    # Idle period: the worker exits only after ``imgs`` stayed empty this
    # long while every upstream queue is empty too. This narrows, but does
    # not fully close, the window in which an upstream worker is still
    # fetching its last item.
    poll_timeout = 10

    # Characters that are path separators or commonly rejected in file
    # names, plus ASCII control characters (titles may contain newlines).
    _UNSAFE_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')

    def __init__(self, pages, women_pages, avs, imgs, lock, *args, **kwargs):
        """Store the upstream queues, the input ``imgs`` and the shared lock."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs
        self.imgs = imgs
        self.lock = lock

    def run(self):
        """Download images until the whole pipeline upstream is drained."""
        while True:
            # A timed get replaces empty()-then-get(): if another worker
            # takes the last record first, this thread re-checks the exit
            # condition instead of blocking forever.
            try:
                img = self.imgs.get(timeout=self.poll_timeout)
            except Empty:
                if (self.pages.empty() and self.women_pages.empty()
                        and self.avs.empty()):
                    break
                continue
            # One bad record must not kill the worker; report it and move on.
            try:
                self.download(img)
            except Exception as exc:
                print('failed to handle image record %r: %r' % (img, exc))

    @classmethod
    def _safe_component(cls, text, fallback):
        """Return ``text`` made usable as one path component.

        Runs of unsafe characters become ``_``; surrounding whitespace and
        dots are stripped. ``fallback`` is returned when nothing is left.
        """
        cleaned = cls._UNSAFE_RE.sub('_', text or '').strip(' .\t')
        return cleaned or fallback

    def download(self, img):
        """Save ``img['src']`` under the performer's folder.

        The folder name comes from ``img['name']`` and the file name from
        ``img['title']``, both sanitised. An empty title falls back to the
        image URL's own file stem. Download errors are reported, not raised.
        """
        src = img['src']
        src_stem = src.rsplit('/', 1)[-1].rsplit('.', 1)[0]
        folder = os.path.join(
            'vip', self._safe_component(img['name'], 'unknown'))
        file_stem = self._safe_component(
            img['title'], self._safe_component(src_stem, 'untitled'))
        target = os.path.join(folder, file_stem + '.jpg')
        # ``with`` guarantees the lock is released even if makedirs raises;
        # the original acquire()/release() pair left it held on error and
        # stalled every other consumer. exist_ok tolerates a folder that
        # already exists.
        with self.lock:
            os.makedirs(folder, exist_ok=True)
        try:
            request.urlretrieve(src, target)
        except Exception as exc:
            # Best effort: report the failed URL and continue.
            print(src, exc)
def main():
    """Build the four queues, seed the listing URLs and start all workers.

    Listing pages 0-54 are queued first. Then 2 ``Producer``, 10
    ``Producer_2``, 10 ``Producer_3`` and 10 ``Consumer`` threads are
    started, in that order. The threads are not joined here.
    """
    listing_template = 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-{}.html'

    # One bounded queue per pipeline stage.
    pages = Queue(60)
    women_pages = Queue(1000)
    avs = Queue(100000)
    imgs = Queue(100000)
    lock = threading.Lock()

    for page_no in range(55):
        pages.put(listing_template.format(page_no))

    # (thread count, worker class, constructor arguments) per stage.
    stages = (
        (2, Producer, (pages, women_pages)),
        (10, Producer_2, (pages, women_pages, avs)),
        (10, Producer_3, (pages, women_pages, avs, imgs)),
        (10, Consumer, (pages, women_pages, avs, imgs, lock)),
    )
    for count, worker_cls, worker_args in stages:
        for _ in range(count):
            worker_cls(*worker_args).start()


if __name__ == '__main__':
    main()