Project: crawl new-house and second-hand-house listings for every city nationwide from the Fang.com (房天下) website
Website URL analysis
1. Get the URLs of all cities from http://www.fang.com/SoufunFamily.htm — each city link looks like http://cq.fang.com/
2. New-house URL, e.g.: http://newhouse.sh.fang.com/house/s/
3. Second-hand-house URL, e.g.: http://esf.sh.fang.com/
4. Beijing follows a different pattern for both kinds: http://newhouse.fang.com/house/s/ and http://esf.fang.com/
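The city URL can be turned into the two listing URLs with plain string work. Here is a minimal sketch of the mapping (the helper name build_listing_urls is mine, not part of the project; it mirrors the logic the spider uses below):

def build_listing_urls(city_url):
    # city_url looks like http://cq.fang.com/
    scheme, domain = city_url.split("//")   # 'http:', 'cq.fang.com/'
    if 'bj' in domain:
        # Beijing's listing domains carry no city prefix.
        return ("http://newhouse.fang.com/house/s/", "http://esf.fang.com/")
    return (scheme + "//newhouse." + domain + "house/s/",
            scheme + "//esf." + domain + "house/s/")

print(build_listing_urls("http://cq.fang.com/"))
# ('http://newhouse.cq.fang.com/house/s/', 'http://esf.cq.fang.com/house/s/')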
Create the project
scrapy startproject fang
scrapy genspider sfw_spider "fang.com"
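After the two commands, the generated project should look roughly like the standard Scrapy layout:

fang/
├── scrapy.cfg
└── fang/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── sfw_spider.py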
sfw_spider.py
# -*- coding: utf-8 -*-
import re

import scrapy

from fang.items import NewHouseItem, ESFHouseItem


class SfwSpiderSpider(scrapy.Spider):
    name = 'sfw_spider'
    allowed_domains = ['fang.com']
    start_urls = ['http://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            # Keep only the <td> tags without a class attribute:
            # the first holds the province, the second the city links.
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r"\s", "", province_text)
            # If the province cell is empty, reuse the province of the previous row.
            if province_text:
                province = province_text
            # Skip overseas cities.
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                # Build the new-house and second-hand-house URLs from city_url:
                #   city URL:          http://cq.fang.com/
                #   new houses:        http://newhouse.cq.fang.com/house/s/
                #   second-hand:       http://esf.cq.fang.com/
                url_module = city_url.split("//")
                scheme = url_module[0]   # http:
                domain = url_module[1]   # cq.fang.com/
                if 'bj' in domain:
                    # Beijing uses a different URL pattern.
                    newhouse_url = 'http://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    # New-house URL
                    newhouse_url = scheme + '//' + "newhouse." + domain + "house/s/"
                    # Second-hand-house URL
                    esf_url = scheme + '//' + "esf." + domain + "house/s/"
                # meta carries extra data along with the Request; the callback
                # reads it back through response.meta.
                yield scrapy.Request(url=newhouse_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city)})
                yield scrapy.Request(url=esf_url,
                                     callback=self.parse_esf,
                                     meta={'info': (province, city)})

    def parse_newhouse(self, response):
        # New houses
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            name = li.xpath(".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = re.sub(r"\s", "", name)
            # Number of rooms
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            # Floor area
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # Address (strip the characters of the '請選擇' placeholder)
            address = li.xpath(".//div[@class='address']/a/@title").get()
            if address:
                address = re.sub(r"[請選擇]", "", address)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|廣告", "", price)
            # Detail-page URL
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(
                name=name,
                rooms=rooms,
                area=area,
                address=address,
                sale=sale,
                price=price,
                origin_url=origin_url,
                province=province,
                city=city
            )
            yield item
        # Next page
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_esf(self, response):
        # Second-hand houses
        province, city = response.meta.get('info')
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            if name:
                infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
                infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
                for info in infos:
                    if "廳" in info:
                        item["rooms"] = info
                    elif '層' in info:
                        item["floor"] = info
                    elif '向' in info:
                        item['toward'] = info
                    elif '㎡' in info:
                        item['area'] = info
                    elif '年建' in info:
                        item['year'] = re.sub("年建", "", info)
                item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
                # Total price
                item['price'] = "".join(dl.xpath(".//span[@class='red']//text()").getall())
                # Price per square metre
                item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
                item['name'] = name
                detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
                item['origin_url'] = response.urljoin(detail)
                yield item
        # Next page
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
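Before wiring XPath expressions like these into the spider, it helps to try them interactively. A session like the following (outputs omitted; the page structure may have changed since this was written) is a quick way to check the city-list selector:

scrapy shell http://www.fang.com/SoufunFamily.htm
>>> trs = response.xpath("//div[@class='outCont']//tr")
>>> len(trs)
>>> trs[0].xpath(".//td[not(@class)]//text()").getall()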
items.py
# -*- coding: utf-8 -*-
import scrapy


class NewHouseItem(scrapy.Item):
    # Province
    province = scrapy.Field()
    # City
    city = scrapy.Field()
    # Estate name
    name = scrapy.Field()
    # Price
    price = scrapy.Field()
    # Number of rooms (a list)
    rooms = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Sale status
    sale = scrapy.Field()
    # Detail-page URL on Fang.com
    origin_url = scrapy.Field()


class ESFHouseItem(scrapy.Item):
    # Province
    province = scrapy.Field()
    # City
    city = scrapy.Field()
    # Estate name
    name = scrapy.Field()
    # Rooms and halls, e.g. 3室2廳
    rooms = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Orientation
    toward = scrapy.Field()
    # Year built
    year = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Built-up area
    area = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Price per square metre
    unit = scrapy.Field()
    # Detail-page URL
    origin_url = scrapy.Field()
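Both item classes behave like dictionaries with a fixed key set: fields can be passed to the constructor or assigned one by one, and assigning an undeclared key raises a KeyError. A small illustration with made-up values:

from fang.items import ESFHouseItem

item = ESFHouseItem(province='重慶', city='重慶')  # hypothetical values
item['rooms'] = '3室2廳'
item['price'] = '120萬'
print(dict(item))
# {'province': '重慶', 'city': '重慶', 'rooms': '3室2廳', 'price': '120萬'}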
pipelines.py
# -*- coding: utf-8 -*-
from scrapy.exporters import JsonLinesItemExporter

from fang.items import NewHouseItem


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # Route each item to the matching file by its item class,
        # so the two data sets do not get mixed.
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
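JsonLinesItemExporter writes one JSON object per line, so the output files can be consumed without loading everything into memory at once. A minimal reader sketch:

import json

with open('newhouse.json', encoding='utf-8') as f:
    for line in f:
        house = json.loads(line)
        print(house.get('city'), house.get('name'), house.get('price'))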
middlewares.py: set a random User-Agent
# -*- coding: utf-8 -*-
import random


class UserAgentDownloadMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request.
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
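Because process_request returns None, Scrapy keeps handling the request with the modified headers. The rotation can be sanity-checked without running a crawl, assuming the project package is importable:

from scrapy.http import Request

from fang.middlewares import UserAgentDownloadMiddleware

mw = UserAgentDownloadMiddleware()
req = Request('http://www.fang.com/')
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])  # one of the four strings above (as bytes), picked at random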
settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1

DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'fang.pipelines.FangPipeline': 300,
}
start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl sfw_spider".split())
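With this file in place, running python start.py from the project root is equivalent to typing scrapy crawl sfw_spider on the command line; it exists purely so the crawl can be launched and debugged from an IDE.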