import urllib import time ##讀取指定的網址 url = [] page = 1 while page <= 11: url_con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1193111400_0_' ...
# Mirror the articles of one Sina blog to local HTML files.
#
# Step 1 walks the blog's numbered article-list pages and collects every
# article URL they link to.  Step 2 downloads each article and stores the raw
# HTML under OUTPUT_DIR.
#
# NOTE: this is a Python 2 script (it relies on ``urllib.urlopen``).
import os
import time
import urllib

# Article-list pages are addressed as <LIST_URL_PREFIX><page number>.html
LIST_URL_PREFIX = 'http://blog.sina.com.cn/s/articlelist_1193111400_0_'
LAST_LIST_PAGE = 11            # list pages 1..11 are scanned
MAX_LINKS_PER_PAGE = 40        # safety cap on links taken from a single page
OUTPUT_DIR = 'blog'            # directory the article HTML is written to
FILENAME_LENGTH = 26           # trailing URL characters used as the file name
DOWNLOAD_DELAY_SECONDS = 5     # pause between two article downloads

# --- Step 1: read the list pages and collect the article URLs --------------
url = []
for page in range(1, LAST_LIST_PAGE + 1):
    url_con = urllib.urlopen(LIST_URL_PREFIX + str(page) + '.html').read()
    print('con %s' % url_con)

    # Locate the first '<a title=' anchor, then the 'href=' that follows it,
    # then the '.html' that ends that link target.
    title = url_con.find('<a title=')
    print('title %s' % title)
    href = url_con.find('href=', title)
    print('href %s' % href)
    html = url_con.find('.html', href)
    print('html %s' % html)

    found = 0
    while (title != -1 and href != -1 and html != -1
           and found < MAX_LINKS_PER_PAGE):
        # href + 6 skips 'href=' plus the opening quote; html + 5 keeps the
        # '.html' suffix inside the slice.
        url.append(url_con[href + 6:html + 5])
        # Report the URL that was just added.  Indexing with the per-page
        # counter would print entries from the first page again, because
        # ``url`` keeps growing across pages.
        print('%s %s' % (page, url[-1]))
        # Continue the search after the link that was just consumed.
        title = url_con.find('<a title=', html)
        href = url_con.find('href=', title)
        html = url_con.find('.html', href)
        found += 1
    print('%s find end' % page)
print('all find end !')

# --- Step 2: download every article and save its HTML ----------------------
print('url sum: %s' % len(url))
if url and not os.path.isdir(OUTPUT_DIR):
    # Without the directory the first open() below would raise IOError.
    os.makedirs(OUTPUT_DIR)
for article_url in url:
    content = urllib.urlopen(article_url).read()
    # The last FILENAME_LENGTH characters of the URL become the file name;
    # presumably this is the 'blog_<id>.html' tail - TODO confirm.
    filename = article_url[-FILENAME_LENGTH:]
    # Binary mode stores the bytes exactly as received; 'with' guarantees the
    # handle is closed even if the write fails.
    with open(os.path.join(OUTPUT_DIR, filename), 'wb') as out_file:
        out_file.write(content)
    # Throttle the requests between downloads.
    time.sleep(DOWNLOAD_DELAY_SECONDS)
以上代碼會獲取所有博客文章的列表,逐一讀取每篇文章的內容,並將其 HTML 存入 blog/ 目錄。