新建發送郵件類 爬取英語學習資料 比如爬取英語學習鏈接:http://www.hjenglish.com/new/c1020/,將當前頁的文章爬取下來,並發送郵件到指定郵箱: ...
新建發送郵件類
import smtplib
from email.mime.text import MIMEText
from email.header import Header
class SendMail:
    """Send plain-text e-mail through an SMTP server with fixed settings.

    The sender address, recipient list, server name, password and port are
    hard-coded in ``__init__``; edit them there before use.
    """

    def __init__(self):
        self.sender = '[email protected]'
        # Recipient addresses; can be set to your QQ mailbox or any other one.
        self.receivers = ['[email protected]','[email protected]']
        self.smtp_server = 'smtp.qq.com'
        self.smtp_pwd = 'xx'
        # NOTE: the attribute name is misspelled ("stmp"); it is kept as-is
        # because code outside this class may read it.
        self.stmp_port = 25

    def _build_message(self, title, msg):
        """Return the MIME message for subject *title* and text body *msg*."""
        # Three arguments: the text content, the subtype ("plain"), and the
        # character set used to encode the body.
        message = MIMEText(msg, 'plain', 'utf-8')
        message['From'] = self.sender
        # Address lists in mail headers are comma-separated (RFC 5322);
        # a ';' separator is not a valid address-list delimiter.
        message['To'] = ', '.join(self.receivers)
        message['Subject'] = Header(title, 'utf-8')
        return message

    def sendMessage(self, title, msg):
        """Send *msg* as a UTF-8 plain-text mail with subject *title*.

        The mail goes from ``self.sender`` to every address in
        ``self.receivers``. Prints ``success`` once the server accepted it.

        Raises:
            smtplib.SMTPException: if STARTTLS, login or delivery fails.
            OSError: if the connection to the server cannot be established.
        """
        message = self._build_message(title, msg)
        # The context manager sends QUIT and closes the socket even when
        # login or delivery raises, so the connection is never leaked.
        with smtplib.SMTP(self.smtp_server, self.stmp_port) as smtp:
            smtp.starttls()
            smtp.login(self.sender, self.smtp_pwd)
            smtp.sendmail(self.sender, self.receivers, message.as_string())
        print('success')
爬取英語學習資料
比如爬取英語學習鏈接:http://www.hjenglish.com/new/c1020/,將當前頁的文章爬取下來,並發送郵件到指定郵箱:
from bs4 import BeautifulSoup
import time, os
import xlwt
import requests
import datetime
import threading
import schedule
from mymodule.SendMail import *
def getLinks(url, timeout=10):
    """Fetch the listing page at *url* and return its article links.

    Every element matching ``.big-link.title-article`` contributes one
    link, built by prefixing its ``href`` with ``http://www.hjenglish.com``.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs. Any failure (network error, HTTP error
        status, parse error) is printed and an empty list is returned, so
        callers can always iterate over the result.
    """
    headers = {
        'Host': 'www.hjenglish.com',
        'Referer': 'http://www.hjenglish.com/new/cet/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    try:
        # Without a timeout a stalled server would block this call forever.
        res = requests.get(url, headers=headers, timeout=timeout)
        try:
            res.raise_for_status()
            html = res.text
        finally:
            # Release the connection even when the status check raises.
            res.close()
        page = BeautifulSoup(html, 'lxml')
        return [
            'http://www.hjenglish.com' + adom['href']
            for adom in page.select('.big-link.title-article')
        ]
    except Exception as err:
        print(err)
        return []
def spiderLink(url, lock):
    """Scrape one article page and mail each article found on it.

    Titles are read from ``.article-header .title`` and bodies from
    ``#J-article-content``. For every title/body pair the module-level
    counter ``total`` is incremented under *lock* and the pair is passed to
    the module-level ``sender.sendMessage``.

    Args:
        url: Address of the article page.
        lock: Lock guarding the shared ``total`` counter.

    Intended as a thread target: every error is printed rather than raised,
    so one bad page does not end the worker with a traceback.
    """
    global total
    print('當前線程', threading.current_thread().name)
    headers = {
        'Host': 'www.hjenglish.com',
        'Referer': 'http://www.hjenglish.com/new/cet/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    try:
        # Without a timeout a stalled server would block this worker forever.
        res = requests.get(url, headers=headers, timeout=10)
        try:
            if res.status_code != 200:
                # Report the skipped page instead of dropping it silently.
                print('skipped', url, 'status', res.status_code)
                return
            html = res.text
        finally:
            # Release the connection on every path, including the early return.
            res.close()
        page = BeautifulSoup(html, "lxml")
        titles = [title.getText() for title in page.select('.article-header .title')]
        contents = [contentDom.getText() for contentDom in page.select('#J-article-content')]
        print(titles, contents)
        # zip() stops at the shorter list, so a page with more titles than
        # bodies cannot cause an IndexError part-way through.
        for title, content in zip(titles, contents):
            with lock:
                total = total + 1
            print(title, content)
            sender.sendMessage(title, content)
    except Exception as err:
        print(err)
def my_job():
    """Scrape the listing page and process every article in its own thread.

    Collects the article links with ``getLinks``, starts one ``spiderLink``
    thread per link, waits for all of them, prints the elapsed whole
    seconds and resets the module-level counter ``total`` to 0.

    On any unexpected error the error is printed and the process is
    terminated immediately with exit status 1.
    """
    global total
    try:
        starttime = datetime.datetime.now()
        url = 'http://www.hjenglish.com/new/c1020/'
        lock = threading.Lock()
        # Treat a missing result (None) like an empty page instead of
        # failing on iteration.
        spider_links = getLinks(url) or []
        threads = [threading.Thread(target=spiderLink, args=(link, lock)) for link in spider_links]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
        elapsed = datetime.datetime.now() - starttime
        # total_seconds() covers the whole duration; the .seconds attribute
        # silently drops full days.
        print('have spend ', str(int(elapsed.total_seconds())) + 's')
        total = 0
    except Exception as err:
        # os._exit() skips stdio flushing, so flush the message explicitly;
        # a non-zero status tells the caller that the job failed.
        print(err, flush=True)
        os._exit(1)
if __name__ == '__main__':
    try:
        # Module-level globals read by spiderLink() and my_job().
        sender = SendMail()
        total = 0
        my_job()
    except Exception as err:
        # os._exit() skips stdio flushing, so flush the message explicitly;
        # a non-zero status signals the failure to the calling shell.
        print(err, flush=True)
        os._exit(1)