# Scrapy example: scraping Tmall product name, price, URL, shop name and shop URL
Fields to scrape: product name, product price, product URL, shop name, shop URL.

While crawling, the site returned several 301/302 responses, but the HTML pages were still downloaded (Scrapy follows redirects by default; see the sketch below).

Start page:

start_urls = ['https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.4.4b3df937tMXU1S&cat=50024399&sort=d&style=g&active=1&industryCatId=50024399&theme=663']

The initial setup should need no explanation (in a cmd window: scrapy startproject tianmao, then scrapy genspider topgood tmall.com).
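The 301/302 lines in the log come from Scrapy's RedirectMiddleware following the redirects before handing the final page to the callback, which is why the HTML still arrives. If you want to inspect the redirect responses themselves, a minimal sketch along these lines should work; the spider name and the shortened URL here are placeholders, not part of the original project:

```python
import scrapy


class RedirectPeekSpider(scrapy.Spider):
    """Sketch: receive 301/302 responses in the callback instead of following them."""
    name = 'redirect_peek'  # hypothetical name, not part of the original project
    # Let 301/302 responses pass HttpErrorMiddleware and reach the callback
    handle_httpstatus_list = [301, 302]

    def start_requests(self):
        url = 'https://list.tmall.com/search_product.htm'  # shortened for illustration
        # dont_redirect tells RedirectMiddleware not to follow the redirect
        yield scrapy.Request(url, meta={'dont_redirect': True}, callback=self.parse)

    def parse(self, response):
        # Log the redirect status and where it points to
        self.logger.info('status=%s location=%s',
                         response.status, response.headers.get('Location'))
```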
Step 1: writing items.py

```python
import scrapy


class TianmaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    GOODS_PRICE = scrapy.Field()  # product price
    GOODS_NAME = scrapy.Field()   # product name
    GOODS_URL = scrapy.Field()    # product URL
    SHOP_NAME = scrapy.Field()    # shop name
    SHOP_URL = scrapy.Field()     # shop URL
```
Note: the idea is to grab the product price, name and URL from the listing page, then follow the product URL to scrape the shop name and shop URL, so only a few representative fields are defined.
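As a quick illustration (not from the original post), a scrapy.Item behaves like a dict, so the fields declared above are set and read by key:

```python
from tianmao.items import TianmaoItem

item = TianmaoItem()
item['GOODS_NAME'] = 'example product'  # made-up value
item['GOODS_PRICE'] = '199.00'          # made-up value
print(dict(item))  # {'GOODS_NAME': 'example product', 'GOODS_PRICE': '199.00'}
```

Assigning a key that was not declared as a Field raises KeyError, which helps catch typos early.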
Step 2: writing the spider
```python
# -*- coding: utf-8 -*-
import scrapy
from tianmao.items import TianmaoItem


class TopgoodSpider(scrapy.Spider):
    name = 'topgood'
    allowed_domains = ['list.tmall.com', 'detail.tmall.com']  # both subdomains the spider requests

    start_urls = ['https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.4.4b3df937tMXU1S&cat=50024399&sort=d&style=g&active=1&industryCatId=50024399&theme=663']

    def parse(self, response):
        # The class names on the listing page change from time to time; adjust the XPath if needed
        divs = response.xpath("//div[@id='J_ItemList']/div[@class='product item-1111 ']/div")
        print(divs)

        for div in divs:
            item = TianmaoItem()
            # price: xpath() serializes the node to a unicode string and returns a SelectorList
            item['GOODS_PRICE'] = div.xpath("p[@class='productPrice']/em/@title")[0].extract()
            print(item)
            # name, e.g. //*[@id="J_ItemList"]/div[3]/div/div[2]/a[1]
            item['GOODS_NAME'] = div.xpath("div[@class='productTitle productTitle-spu']/a[1]/@title")[0].extract()
            print(item)
            pre_Product_Url = div.xpath("div[@class='productTitle productTitle-spu']/a[1]/@href").extract_first()

            # Listing pages use protocol-relative links ("//detail.tmall.com/..."),
            # so build an absolute URL when the scheme is missing
            if 'http' not in pre_Product_Url:
                pre_Product_Url = response.urljoin(pre_Product_Url)

            item['GOODS_URL'] = pre_Product_Url
            print(item)
            yield scrapy.Request(url=pre_Product_Url, meta={'item': item},
                                 callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        divs = response.xpath("//div[@class='extend']/ul")

        if len(divs) == 0:
            self.log("Detail Page error --%s" % response.url)
            return  # nothing to extract on this page, so stop here instead of raising IndexError

        div = divs[0]
        item = response.meta['item']
        item['SHOP_NAME'] = div.xpath("li[1]/div[1]/a/text()")[0].extract().strip()
        item['SHOP_URL'] = div.xpath("li[1]/div[1]/a/@href")[0].extract()

        yield item
```

To export the results as CSV: scrapy crawl topgood -o result.csv
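The XPath expressions are the part most likely to break, since Tmall's markup changes. A small offline check (not from the original post; the HTML fragment below is hand-written and far simpler than the real page) can confirm the selector logic before running a full crawl:

```python
# Illustrative offline check of the spider's XPath logic against a tiny fragment.
from scrapy import Selector

html = """
<div id="J_ItemList">
  <div class="product item-1111 ">
    <div>
      <p class="productPrice"><em title="199.00">199.00</em></p>
      <div class="productTitle productTitle-spu">
        <a title="Example product" href="//detail.tmall.com/item.htm?id=1">Example product</a>
      </div>
    </div>
  </div>
</div>
"""

sel = Selector(text=html)
div = sel.xpath("//div[@id='J_ItemList']/div[@class='product item-1111 ']/div")[0]
print(div.xpath("p[@class='productPrice']/em/@title").extract_first())                       # 199.00
print(div.xpath("div[@class='productTitle productTitle-spu']/a[1]/@title").extract_first())  # Example product
print(div.xpath("div[@class='productTitle productTitle-spu']/a[1]/@href").extract_first())   # //detail.tmall.com/item.htm?id=1
```

The same expressions can also be tried interactively with scrapy shell against the live page.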
Step 3: writing settings.py
```python
BOT_NAME = 'tianmao'

SPIDER_MODULES = ['tianmao.spiders']
NEWSPIDER_MODULE = 'tianmao.spiders'

LOG_FILE = 'topgood.log'
LOG_STDOUT = True
DEPTH_LIMIT = 2

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"

# Output .csv
FEED_URI = u'goods.csv'
FEED_FORMAT = 'CSV'

DOWNLOAD_DELAY = 5

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xm…plication/xml;q=0.9,*/*;q=0.8',  # value truncated in the original post
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    "Cookie": "the Cookie header from your own browser session on the site",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
}

ITEM_PIPELINES = {
    'tianmao.pipelines.TianmaoPipeline': 300,
}
```
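ITEM_PIPELINES enables tianmao.pipelines.TianmaoPipeline, but the original post does not show pipelines.py. A minimal sketch of what it might contain; the price-stripping step is just an assumption for illustration:

```python
# tianmao/pipelines.py (sketch, not from the original post)


class TianmaoPipeline(object):
    def process_item(self, item, spider):
        # Tidy the price field if present (illustrative), then pass the item on
        if item.get('GOODS_PRICE'):
            item['GOODS_PRICE'] = item['GOODS_PRICE'].strip()
        return item
```

Returning the item is what lets it continue on to the feed exporter that writes goods.csv.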
Note: what the settings mean
- `BOT_NAME`: the project name.
- `DEFAULT_REQUEST_HEADERS`: the default request headers.
- `DEPTH_LIMIT`: maximum crawl depth; 0 (the default) means no limit.
- `ITEM_PIPELINES`: a dict of the pipelines enabled in the project and their order. Empty by default; the values can be anything, but by convention they are kept in the 0-1000 range.
- `LOG_ENABLED`: whether logging is enabled.
- `LOG_ENCODING`: the encoding used for logging.
- `LOG_FILE`: the log file name.
- `LOG_LEVEL`: the logging level (CRITICAL, ERROR, WARNING, INFO, DEBUG).
- `LOG_STDOUT`: default False. If True, all standard output (and errors) of the process is redirected to the log; for example, print('hello') will then show up in the Scrapy log.
- `RANDOMIZE_DOWNLOAD_DELAY`: default True; randomizes the pause between requests to the same site.
- `DOWNLOAD_DELAY`: default 0.
- `ROBOTSTXT_OBEY`: whether to obey robots.txt; default False.
- `FEED_FORMAT`: the format the exported data is saved in.
- `FEED_URI`: the path and file name the data is saved to.
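These options can also be set per spider instead of project-wide. A short sketch (not from the original post) using Scrapy's custom_settings class attribute:

```python
import scrapy


class TopgoodSpider(scrapy.Spider):
    name = 'topgood'
    # Per-spider overrides: these take precedence over settings.py for this spider only
    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'DEPTH_LIMIT': 2,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'goods.csv',
    }
```

Running scrapy crawl topgood then uses these values without touching the shared settings.py.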