參考資料: Python:http://www.runoob.com/python/python-intro.html Python爬蟲系列教程:http://www.cnblogs.com/xin-xin/p/4297852.html 正則表達式:http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
參考資料:
Python:http://www.runoob.com/python/python-intro.html
Python爬蟲系列教程:http://www.cnblogs.com/xin-xin/p/4297852.html
正則表達式:http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
本貼目標:
1.對百度貼吧的任意帖子進行抓取
2.指定是否只抓取樓主發帖內容
3.將抓取到的內容分析並保存到文件
4.抓取帖子中出現的美圖
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 15 11:47:02 2016

@author: wuhan

Scrape a Baidu Tieba thread: the text of every post is written to a
``<title>.txt`` file and the ``.jpg`` images embedded in the posts are
downloaded to a dated folder.

The code is written so that it runs on both Python 2 and Python 3.
"""
import io
import os
import re
import time

try:  # Python 2
    from urllib import urlretrieve
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen, urlretrieve

# Default root folder for downloaded pictures (one dated sub-folder per run day).
DEFAULT_PIC_DIR = 'E:\\Python\\ImageDownload'

# Rule written after the floor number when floor tags are enabled.
FLOOR_SEPARATOR = u"-----------------------------------------------------------------------------------------------------------------------------------------"

# Characters that cannot appear in a file name on Windows and/or POSIX.
_UNSAFE_FILENAME_CHARS = re.compile(u'[\\\\/:*?"<>|\r\n\t]')


def _to_text(value, encoding='utf-8'):
    """Return *value* as unicode text, decoding it first if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode(encoding, 'replace')
    return value


class Tool(object):
    """Turn the HTML of a single post into plain text."""

    removeImg = re.compile('<img.*?>| {12}')
    removeAddr = re.compile('<a.*?>|</a>')
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    replaceTD = re.compile('<td>')
    replacePara = re.compile('<p.*?>')
    replaceBR = re.compile('<br><br>|<br>')
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Strip/convert the HTML tags in *x* and return the trimmed text.

        Images and links are removed, block-level closers and ``<br>`` become
        newlines, ``<p ...>`` becomes a newline plus one space, ``<td>``
        becomes a tab, and every remaining tag is dropped.
        """
        # Order matters: the catch-all tag remover must run last.
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceBR.sub("\n", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceTD.sub("\t", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


class BDTB(object):
    """Downloader for one Baidu Tieba thread.

    Args:
        baseUrl: URL of the thread, e.g. ``http://tieba.baidu.com/p/<id>``.
        seeLZ: appended as the ``see_lz`` query parameter of every request.
        floorTag: the string ``'1'`` to write a numbered separator line
            before every post, anything else to omit it.
        picDir: root folder for downloaded pictures.
    """

    _TITLE_RE = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
    _PAGE_NUM_RE = re.compile(
        '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _CONTENT_RE = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)
    # The dot before "jpg" is escaped and the match is confined to one tag /
    # one attribute value so a non-jpg image cannot swallow its neighbours.
    _IMAGE_RE = re.compile(r'<img class="BDE_Image"[^>]*?src="([^"]+?\.jpg)')

    def __init__(self, baseUrl, seeLZ, floorTag, picDir=DEFAULT_PIC_DIR):
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = u'百度貼吧'
        self.floorTag = floorTag
        self.picDir = picDir

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread.

        Returns:
            The page decoded as UTF-8 text, or ``None`` when the request
            fails with ``URLError`` (the reason is printed).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urlopen(Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except URLError as e:
            if hasattr(e, "reason"):
                print(u'百度貼吧鏈接失敗,錯誤原因 :' + u' ' + _to_text(str(e.reason)))
            return None

    def getTitle(self, page):
        """Return the thread title found in *page*, or ``None`` if absent."""
        result = self._TITLE_RE.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Return the page count shown in *page* (as text), or ``None``."""
        result = self._PAGE_NUM_RE.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getContents(self, page):
        """Return the cleaned text of every post in *page*.

        Each entry is wrapped in newlines and encoded as UTF-8 bytes.
        """
        return [
            ("\n" + self.tool.replace(item) + "\n").encode('utf-8')
            for item in self._CONTENT_RE.findall(page)
        ]

    @staticmethod
    def _safeFileName(title):
        """Replace characters that are illegal in file names with ``_``."""
        return _UNSAFE_FILENAME_CHARS.sub(u"_", _to_text(title)).strip()

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing (``defaultTitle`` if title is None).

        The file is opened as UTF-8 text; any previously opened file is
        closed first.
        """
        name = self._safeFileName(
            title if title is not None else self.defaultTitle)
        if not name:
            name = self.defaultTitle
        if self.file is not None:
            self.file.close()
        self.file = io.open(name + u".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write every post in *contents* to the open file.

        Entries may be UTF-8 bytes (as returned by ``getContents``) or text.
        When ``floorTag == '1'`` a numbered separator precedes each post.
        The floor counter advances once per post either way.
        """
        for item in contents:
            if self.floorTag == '1':
                self.file.write(
                    u"\n%d%s\n" % (self.floor, FLOOR_SEPARATOR))
            self.file.write(_to_text(item))
            self.floor += 1

    def start(self):
        """Download the whole thread: text to file, pictures to disk."""
        indexPage = self.getPage(1)
        # A failed fetch is treated like a page without a page counter.
        pageNum = None if indexPage is None else self.getPageNum(indexPage)
        if pageNum is None:
            print(u"URL已失效,請重試")
            return
        # Only create the output file once the thread is known to be valid.
        self.setFileTitle(self.getTitle(indexPage))
        try:
            print(u"該貼子共有" + _to_text(pageNum) + u"頁")
            for i in range(1, int(pageNum) + 1):
                print(u"正在寫入第" + str(i) + u"頁數據")
                # Page 1 is already in hand; do not download it twice.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    continue
                self.writeData(self.getContents(page))
                self.getPicture(page, i)
        except IOError as e:
            print(u"寫入異常,原因" + _to_text(str(e)))
        finally:
            if self.file is not None:
                self.file.close()
            print(u"寫入任務完成")

    def _findImageUrls(self, page):
        """Return the ``.jpg`` URLs of all ``BDE_Image`` tags in *page*."""
        return self._IMAGE_RE.findall(page)

    def getPicture(self, page, PageNum):
        """Download the post images of *page* into ``<picDir>/<Y-M-D>/``.

        Files are named ``<PageNum>_<index>.jpg``. Does nothing when *page*
        is ``None``.
        """
        if page is None:
            return
        imglist = self._findImageUrls(page)
        t = time.localtime()
        foldername = "%d-%d-%d" % (t.tm_year, t.tm_mon, t.tm_mday)
        picpath = os.path.join(self.picDir, foldername)
        if not os.path.exists(picpath):
            os.makedirs(picpath)
        for x, imgurl in enumerate(imglist):
            target = os.path.join(picpath, '%s_%s.jpg' % (PageNum, x))
            urlretrieve(imgurl, target)


def main():
    """Ask for the thread id and options on the console, then run."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    print(u"請輸入帖子代號")
    baseURL = 'http://tieba.baidu.com/p/' + str(
        read_line(u'http://tieba.baidu.com/p/')).strip()
    # Prompts are printed as text so the console's own encoding is used.
    print(u"是否只獲取樓主發言,是輸入1,否輸入0")
    seeLZ = read_line().strip()
    print(u"是否寫入樓層信息,是輸入1,否輸入0")
    floorTag = read_line().strip()
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


if __name__ == "__main__":
    main()