"""Scrape Taobao search results with Selenium and store them in MongoDB.

The script opens the Taobao home page, searches for ``KEYWORD``, walks
through every result page and saves one document per listed product into
the ``MONGO_TABLE`` collection of the ``MONGO_DB`` database.
"""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Names used from config in this file: MONGO_URL, MONGO_DB, MONGO_TABLE,
# KEYWORD (and SERVER_ARGS for the PhantomJS alternative below).
from config import *

# Default number of attempts for a page interaction before the
# TimeoutException is allowed to propagate to the caller.
_MAX_ATTEMPTS = 10

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# browser = webdriver.PhantomJS(service_args=SERVER_ARGS)
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
# browser.set_window_size(1400,900)


def search(max_attempts=_MAX_ATTEMPTS):
    """Search Taobao for ``KEYWORD`` and scrape the first result page.

    Args:
        max_attempts: How many times to try before giving up. Values below
            1 are treated as a single attempt.

    Returns:
        The text of the pager's "total" element, which ``main`` parses for
        the page count.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempt = 0
    # Retry in a loop rather than by recursion so repeated timeouts cannot
    # grow the call stack.
    while True:
        attempt += 1
        print('SEARCHING...')
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            if attempt >= max_attempts:
                raise


def next_page(page_number, max_attempts=_MAX_ATTEMPTS):
    """Jump to result page *page_number* and scrape its products.

    Args:
        page_number: Page number typed into the pager's input box.
        max_attempts: How many times to try before giving up. Values below
            1 are treated as a single attempt.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempt = 0
    # Retry in a loop rather than by recursion so repeated timeouts cannot
    # grow the call stack.
    while True:
        attempt += 1
        print('PAGE TURNING...', page_number)
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # The highlighted pager item shows the requested number once
            # the new page has loaded.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            if attempt >= max_attempts:
                raise


def get_products():
    """Parse the current result page and save every listed product.

    Waits for at least one item to be present, then extracts image, price,
    deal count, title, shop and location from each item with PyQuery.
    Each product is printed and passed to ``save_to_mongo``.

    Raises:
        TimeoutException: If no item appears within the wait timeout.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drop the last three characters of the deal-count text.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert *result* into the ``MONGO_TABLE`` collection.

    Failures are reported on stdout together with the exception and are
    not re-raised, so one bad document does not stop the crawl.

    Args:
        result: Product dict to store.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('FAILD!', result, exc)
    else:
        print('SUCCESSD!', result)


def main():
    """Run the crawl: search, then visit result pages 2..total.

    Any exception is reported on stdout, and the browser session is always
    shut down at the end.
    """
    try:
        total_text = search()
        total = int(re.search(r'(\d+)', total_text).group(1))
        for page_number in range(2, total + 1):
            next_page(page_number)
    except Exception as exc:
        print('error...', exc)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        browser.quit()


if __name__ == '__main__':
    main()