請求庫之selenium_ZenDei技術網路在線

###一、介紹 selenium最初是一個自動化測試工具,而爬蟲中使用它主要是為瞭解決requests無法直接執行JavaScript代碼的問題 selenium本質是通過驅動瀏覽器，完全模擬瀏覽器的操作，比如跳轉、輸入、點擊、下拉等，來拿到網頁渲染之後的結果，可支持多種瀏覽器 from selen ...

一、介紹

selenium最初是一個自動化測試工具,而爬蟲中使用它主要是為瞭解決requests無法直接執行JavaScript代碼的問題

selenium本質是通過驅動瀏覽器，完全模擬瀏覽器的操作，比如跳轉、輸入、點擊、下拉等，來拿到網頁渲染之後的結果，可支持多種瀏覽器

from selenium import webdriver

# Create a browser object; each call is equivalent to opening that browser yourself.
# Every line rebinds `browser`, so in practice only one of them is needed.
browser=webdriver.Chrome()
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge()

官網

二、安裝

1、有界面瀏覽器

#安裝：selenium+chromedriver
pip3 install selenium

瀏覽器驅動，鏡像站:http://npm.taobao.org/mirrors/chromedriver/
最新的版本去官網找:https://sites.google.com/a/chromium.org/chromedriver/downloads

驅動要跟瀏覽器版本對應 84.0.4147.105：驅動用84.0.4147.30/ 找比本機低的版本，向下相容
下載--->解壓是不同平臺的可執行文件，windows是exe文件
下載chromdriver.exe放到python安裝路徑的scripts目錄中，因為python路徑已經加到環境變數中了

#驗證安裝
# chromdriver.exe如果沒有加入環境變數，需要指定使用跟那個驅動
# 指定路徑寫絕對路徑，這裡因為放在項目根路徑下，用的相對路徑
from selenium import webdriver
import time

from selenium.webdriver.chrome.service import Service

# Start Chrome through the chromedriver binary that sits in the project root.
# Selenium 4 takes the driver path through a Service object; the old
# `executable_path` keyword is deprecated and rejected by recent releases.
driver = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    time.sleep(2)
    driver.get('https://www.baidu.com/')   # same as typing the URL into the address bar
    time.sleep(2)
    print(driver.page_source)              # HTML of the page, including data rendered by JavaScript
    time.sleep(2)
finally:
    # quit() closes every window AND stops the chromedriver process;
    # close() would only close the current window.
    driver.quit()

2、無界面瀏覽器

phantomjs，已經不維護了

# 谷歌瀏覽器支持不打開頁面
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

from selenium.webdriver.chrome.service import Service

chrome_options = Options()                                           # container for Chrome start-up settings
chrome_options.add_argument('--window-size=1920,3000')               # browser resolution; Chrome expects "width,height"
chrome_options.add_argument('--disable-gpu')                         # recommended by the Chrome docs to avoid a bug
chrome_options.add_argument('--hide-scrollbars')                     # hide scrollbars, helps with some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')    # do not load images, speeds things up

# Explicit location of the Chrome binary. Not needed when Chrome is installed
# in its default location and can be found automatically.
chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"

# Run without a visible window. On Linux without a display the browser
# fails to start unless this flag is set.
chrome_options.add_argument('--headless')

# Selenium 4 API: `options=` replaces `chrome_options=` and the driver path
# is passed through a Service object instead of `executable_path=`.
drive = webdriver.Chrome(
    options=chrome_options,
    service=Service(executable_path='./chromedriver.exe'),
)
try:
    drive.get('https://www.baidu.com/')
    print(drive.page_source)
finally:
    # A headless browser shows no window but its processes keep running;
    # quit() stops both browser and driver so they do not pile up across runs.
    drive.quit()

三、基本使用

#模擬登錄百度

from selenium import webdriver
from selenium.webdriver.common.by import By                 #按照什麼方式查找，By.ID, By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys             #鍵盤按鍵操作
import time

from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path through a Service object.
drive = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    # 1. Type a query and run the search
    drive.get('https://www.baidu.com/')
    time.sleep(0.01)
    input_search = drive.find_element(By.ID, 'kw')   # look up the search <input> by its id
    input_search.send_keys('美女')                   # type the query; it could also come from a database
    time.sleep(2)

    # Option 1: find the search button and click it
    sou = drive.find_element(By.ID, 'su')
    sou.click()

    # Option 2: simulate the keyboard and press Enter in the input box.
    # Either option is enough on its own to submit the search.
    input_search.send_keys(Keys.ENTER)
    time.sleep(3)
finally:
    drive.quit()   # close all windows and stop the chromedriver process


# 2、模擬登錄
import os

from selenium.webdriver.chrome.service import Service

# Read the account from the environment so that no real credentials are
# stored in the source. A missing variable fails fast with KeyError before
# any browser is started.
baidu_username = os.environ['BAIDU_USERNAME']
baidu_password = os.environ['BAIDU_PASSWORD']

# Selenium 4 takes the driver path through a Service object.
drive = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    # Implicit wait: when an element is not present yet, keep retrying for up
    # to 5 seconds before raising. It is set once and applies to every
    # subsequent element lookup on this driver.
    drive.implicitly_wait(5)
    drive.get('https://www.baidu.com/')

    # The login entry is an <a> tag; locate it by its link text.
    login_button = drive.find_element(By.LINK_TEXT, '登錄')
    login_button.click()

    # Switch the dialog to username/password login. The dialog is rendered
    # after the click, which is why the implicit wait above is required.
    login_u = drive.find_element(By.ID, 'TANGRAM__PSP_11__footerULoginBtn')
    login_u.click()

    # Fill in the username and password inputs.
    username = drive.find_element(By.ID, 'TANGRAM__PSP_11__userName')
    username.send_keys(baidu_username)
    password = drive.find_element(By.ID, 'TANGRAM__PSP_11__password')
    password.send_keys(baidu_password)
    time.sleep(3)

    # Find and click the submit button.
    submit = drive.find_element(By.ID, 'TANGRAM__PSP_11__submit')
    submit.click()
    time.sleep(10)

    # Cookies of the session (useful for building a cookie pool and sending
    # requests with the requests module). If the login did not succeed these
    # are the cookies of an anonymous session.
    print(drive.get_cookies())
finally:
    drive.quit()   # close all windows and stop the chromedriver process

四、選擇器

1、selenium內置的選擇器

1、find_element_by_id   # 通過id查找控制項
   新版 find_element(By.ID, 'search_input') 
    
2、find_element_by_link_text  # 通過標簽內容（文本）找
	新版 find_element(By.LINK_TEXT, 'hao123')
    
3、find_element_by_partial_link_text  # 通過標簽內容找，模糊匹配
	新版 find_element(By.PARTIAL_LINK_TEXT, 'hao')

4、find_element_by_tag_name   # 通過標簽名查找
	新版 find_element(By.TAG_NAME, 'title')

5、find_element_by_class_name  # 通過類名查找
	新版 find_element(By.CLASS_NAME, 's_ipt') 
    
6、find_element_by_name        # name屬性
	新版 find_element(By.NAME, 'wd') 

7、find_element_by_css_selector  # 通過css選擇器
	新版 find_element(By.CSS_SELECTOR, '#kw') 
    
8、find_element_by_xpath       # 通過xpaht選擇器
	新版 find_element(By.XPATH, '//*[@id="kw"]') 
    
# 強調：
# 新版用法，先聲明By模塊，from selenium.webdriver.common.by import By
# find_element_by_xxx  單數只找第一個
# find_elements_by_xxx 複數是查找到所有，結果為列表

2、xpath選擇器

介紹
xpath: XPath 是一門在 XML 文檔中查找信息的語言
/  :從根節點選取。類似bs4的遍歷文檔樹
// :不管位置，直接找
.  :選取當前節點
/@屬性名  ：取當前標簽的屬性
/text()  : 取當前標簽的文本
瀏覽器調試可賦值css和xpath

# Sample HTML document used by the XPath examples
doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' aa='bb'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''

from lxml import etree

# Parse the sample document held in `doc`.
html = etree.HTML(doc)
# To parse an HTML file from disk instead, use:
#   html = etree.parse('search.html', etree.HTMLParser())
# It is not executed here because it would replace the tree built from `doc`
# (and fail when search.html does not exist).

# 1. All nodes: returns a list with every element object
res = html.xpath('//*')

# 2. A specific node (result is a list): find the head tag
res = html.xpath('//head')

# 3. Children and descendants
a = html.xpath('//div/a')    # every <a> that is a direct child of a <div>
a = html.xpath('//body/a')   # empty: <body> has no direct <a> child
a = html.xpath('//body//a')  # every <a> anywhere below <body>

# 4. Parent node
a = html.xpath('//body//a[@href="image1.html"]/..')   # filter <a> by href="image1.html", then go to the parent
a = html.xpath('//body//a[1]/..')                     # take the first <a>, /.. selects its parent
# Equivalent, using the parent axis
a = html.xpath('//body//a[1]/parent::*')

# 5. Attribute match
a = html.xpath('//body//a[@href="image1.html"]')

# 6. Text extraction (important): /text() returns the text of the current tag
a = html.xpath('//body//a[@href="image1.html"]/text()')   # text of that one <a>, as a list
a = html.xpath('//body//a/text()')                        # text of every <a>, as a list

# 7. Attribute extraction: @href returns the href attribute of the current tag
a = html.xpath('//body//a/@href')   # href of every <a>, as a list

# Note: XPath positions start at 1, not 0
a = html.xpath('//body//a[1]/@href')

# 8. Matching an attribute with several values
# The <a> carries class='li li-item', so an exact comparison fails; use contains()
a = html.xpath('//body//a[@class="li"]')            # no match: the attribute value is "li li-item"
a = html.xpath('//body//a[contains(@class,"li")]')  # class contains "li": matches the <a>
a = html.xpath('//body//a[contains(@class,"li")]/text()')

# 9. Matching on several attributes
a = html.xpath('//body//a[contains(@class,"li") or @name="items"]')            # "or": two elements match
a = html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')    # "and": only one matches
a = html.xpath('//body//a[contains(@class,"li")]/text()')

# 10. Selecting by position
a = html.xpath('//a[2]/text()')            # the second <a>
a = html.xpath('//a[2]/@href')
a = html.xpath('//a[last()]/@href')        # the last one
a = html.xpath('//a[position()<3]/@href')  # positions lower than 3
a = html.xpath('//a[last()-2]/@href')      # third from last (last()-1 would be second from last)

# 11. Axes
# ancestor: ancestor nodes
a = html.xpath('//a/ancestor::*')     # * selects every ancestor
a = html.xpath('//a/ancestor::div')   # only the <div> ancestors

# attribute: attribute values
a = html.xpath('//a[1]/attribute::*')  # every attribute of the first <a>
a = html.xpath('//a[1]/@aa')           # the aa attribute of the first <a>

# child: direct children
a = html.xpath('//a[1]/child::*')
a = html.xpath('//a[1]/child::img/@src')  # src of the <img> child of the first <a>

# descendant: all descendants
a = html.xpath('//a[6]/descendant::*')
a = html.xpath('//a[6]/descendant::h5/text()')

# following: every node after the current one (siblings and the nodes inside them)
a = html.xpath('//a[1]/following::*')
a = html.xpath('//a[1]/following::*[1]/@href')

# following-sibling: nodes after the current one on the same level (siblings only)
a = html.xpath('//a[1]/following-sibling::*')
a = html.xpath('//a[1]/following-sibling::a')
a = html.xpath('//a[1]/following-sibling::*[2]')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)

瀏覽器network找到需要的標簽，複製selector(css選擇器)，複製xpath(xpath選擇器)
//以後去查找標簽，bs4的find，  css，xpath（通用的）

3、獲取標簽屬性

# Key points
tag.get_attribute('href')   # tag is an element returned by a lookup; get_attribute reads the value of its href attribute
tag.text                    # text content of the element
tag.location                # position of the element on the page; useful e.g. for cropping a captcha screenshot

# Good to know
tag.id        # id of the element (NOTE(review): likely Selenium's internal element id rather than the HTML id attribute - confirm)
tag.tag_name  # tag name
tag.size      # size of the element

4、等待元素被載入

#1、selenium只是模擬瀏覽器的行為，而瀏覽器解析頁面是需要時間的（執行css，js），一些元素可能需要過一段時間才能載入出來，為了保證能查找到元素，必須等待

#2、等待的方式分兩種：
隱式等待：在browser.get（'xxx'）前就設置，針對所有元素有效
顯式等待：在browser.get（'xxx'）之後設置，只針對某個元素有效

# Implicit wait: when looking up any element that has not loaded yet, wait up to 10 seconds
browser.implicitly_wait(10)

5、元素交互操作

5.1 基本操作
tag.send_keys()  # type content into the element (search box, input field)
tag.click()      # click the element (e.g. a submit button)
tag.clear()      # clear the content of the element


5.2、自己寫js
# 有一些操作需要藉助js來做，交互動作難實現，自己手寫js代碼，比如寫js代碼取出cookie
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path through a Service object.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    bro.implicitly_wait(5)
    bro.get('https://www.baidu.com/')

    # The argument is JavaScript source. Double quotes on the Python side let
    # the JavaScript string keep its single quotes.
    bro.execute_script("alert('hello')")
    # An open alert blocks every further WebDriver command, so accept it first.
    bro.switch_to.alert.accept()

    bro.execute_script('window.open()')  # open a new page from JavaScript
    bro.execute_script('window.open()')  # and one more
    time.sleep(2)
finally:
    # Several windows are open now; quit() closes all of them and stops the
    # driver, whereas close() would only close the current window.
    bro.quit()


5.3 ActionChains 動作鏈(瞭解)
from selenium.webdriver import ActionChains

# `driver` is an open WebDriver; `img` and `source` are elements found
# earlier; `xoffset` / `yoffset` are pixel distances. They are placeholders
# that the surrounding script has to define.

# Move the mouse by an x/y distance from its current position
ActionChains(driver).move_by_offset(xoffset=2, yoffset=2).perform()

# Move directly onto an element
ActionChains(driver).move_to_element(img).perform()

# Move to an x/y offset relative to an element
ActionChains(driver).move_to_element_with_offset(img, xoffset, yoffset).perform()

# Press the left mouse button on an element and keep it held; moving the
# mouse afterwards results in a drag. Used to imitate slider captchas,
# although their server-side checks have become hard to pass.
ActionChains(driver).click_and_hold(source).perform()

# Release the held mouse button
ActionChains(driver).release().perform()


5.4 frame的切換，現在很少了，相當於1個瀏覽器雙開了2個頁面，可以相互切換
bro.switch_to.frame('iframeResult') # switch into the frame whose id is iframeResult


5.5 如何把屏幕拉到最下（js控制）
# window.scrollTo(x, y) scrolls the page to the given coordinates;
# x=0 and y=the body's offsetHeight moves the view to the bottom of the page.
bro.execute_script('window.scrollTo(0,document.body.offsetHeight)')

五、其他

1、模擬瀏覽器前進和後退

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path through a Service object.
bro = webdriver.Chrome(service=Service(executable_path='./chromedriver.exe'))
try:
    # Visit three pages in a row so the browser has a history to walk through.
    bro.get('https://www.baidu.com')
    bro.get('https://www.taobao.com')
    bro.get('http://www.sina.com.cn/')

    bro.back()       # one step back in the history
    time.sleep(1)
    bro.forward()    # and forward again
finally:
    bro.quit()       # close all windows and stop the chromedriver process

2、cookies

bro.get_cookies()                                # read all cookies of the current session
# add_cookie() takes ONE cookie per call, described by a dict that must
# contain the keys 'name' and 'value'. With the login cookies in place the
# browser sends them automatically on later requests.
bro.add_cookie({'name': 'k1', 'value': 'xxx'})
bro.add_cookie({'name': 'k2', 'value': 'yyy'})
bro.delete_all_cookies()                         # remove every cookie from the browser session

3、選項卡管理(瞭解)

from selenium import webdriver
import time
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')               # run JavaScript that opens a new tab

    print(browser.window_handles)                         # handles of all open tabs
    # Selenium 4 API: switch_to.window(); the old switch_to_window() was removed.
    browser.switch_to.window(browser.window_handles[1])   # move to the newly opened tab
    browser.get('https://www.taobao.com')
    time.sleep(2)
    browser.switch_to.window(browser.window_handles[0])   # move back to the first tab
    browser.get('https://www.sina.com.cn')
    browser.close()                                       # closes only the current tab (tab 0)
finally:
    browser.quit()                                        # closes the remaining tab and stops the driver

4、異常處理

from selenium import webdriver
# selenlium中的異常分幾類，不用分這麼細，總的捕獲異常就行了，所有操作放在try中
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('')
except Exception as e:
    # Deliberately broad: this example catches every failure in one place
    # and only reports it.
    print(e)
finally:
    # Runs whether or not an exception occurred. quit() closes all windows
    # and stops the driver process; close() would only close the current window.
    browser.quit()