7.1. Qiushibaike (糗事百科)
Installation
Note: the Twisted wheel below targets Python 3.6 on 64-bit Windows; pick the wheel that matches your own interpreter and platform.
pip install pypiwin32
pip install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
pip install scrapy
Creating and running the project
scrapy startproject qsbk                          # create the project
scrapy genspider qsbk_spider "qiushibaike.com"    # create the spider
scrapy crawl qsbk_spider                          # run the spider
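For orientation, startproject and genspider produce the standard Scrapy layout; the files edited below live here (exact contents may vary slightly by Scrapy version):

qsbk/
    scrapy.cfg
    qsbk/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qsbk_spider.py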
Code
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        # Each joke sits in its own div under #content-left
        duanzidivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanzidivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        # Crawl the data on the following pages
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
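As an aside, Scrapy can resolve the relative pagination link itself via response.urljoin, which removes the need for the hand-rolled base_domain attribute. A minimal sketch of the same pagination step:

# Equivalent tail of parse(), using response.urljoin instead of
# concatenating base_domain by hand:
next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
if next_url:
    yield scrapy.Request(response.urljoin(next_url), callback=self.parse)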
items.py
import scrapy


class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
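A Scrapy Item behaves much like a dict, which is why the manual pipeline below can call dict(item) before serializing. A quick sketch (the field values here are made up for illustration):

item = QsbkItem(author='someone', content='a joke')
print(item['author'])   # field access works like a dict
print(dict(item))       # {'author': 'someone', 'content': 'a joke'}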
pipelines.py
# -*- coding: utf-8 -*-
import json

# 1. Manually serialize each dict to JSON
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('Spider finished')

# 2. JsonItemExporter: suitable when the data volume is small
#    (it buffers every item and writes one big JSON list at the end)
# from scrapy.exporters import JsonItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('Spider finished')

# 3. JsonLinesItemExporter: suitable when the data volume is large
#    (each item is written to disk immediately as one JSON line)
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')
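Because JsonLinesItemExporter writes one JSON object per line, the output file can be streamed back without loading everything at once. A small sketch of reading duanzi.json after a crawl:

import json

with open('duanzi.json', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)          # one item per line
        print(item['author'], item['content'][:20])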
settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
ITEM_PIPELINES = {
'qsbk.pipelines.QsbkPipeline': 300,
}
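The number 300 is the pipeline's order within the range 0–1000; lower values run closer to the spider. A sketch of how a second, purely hypothetical pipeline would slot in before the JSON export:

ITEM_PIPELINES = {
    # 'qsbk.pipelines.CleanTextPipeline': 200,  # hypothetical: would run first
    'qsbk.pipelines.QsbkPipeline': 300,
}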
start.py
from scrapy import cmdline

# Lets you run/debug the spider from an IDE instead of the command line
cmdline.execute("scrapy crawl qsbk_spider".split())
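An equivalent way to launch the crawl programmatically is Scrapy's CrawlerProcess; a sketch, assuming the spider class path generated above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qsbk.spiders.qsbk_spider import QsbkSpiderSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(QsbkSpiderSpider)
process.start()                                   # blocks until the crawl finishes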