A practical example of using Selenium with Python: more substantial than a demo, but not too complicated.
The trick is summarized as follows:
The address of the latest chromedriver is https://googlechromelabs.github.io/chrome-for-testing. This is very important, otherwise you will have to deal with annoying problems such as automatic chrome updates. Many download sources are a bit outdated.
Use `Options` to configure the location of the Chrome binary (`binary_location`) and the SOCKS5 proxy (`--proxy-server=...`).
driver.page_source can print the current html and assist in judging the execution progress.
XPath (XML Path Language) is a language for navigating through elements and attributes in XML documents and is a W3C standard. HTML is not strictly XML, but the browser parses it into a DOM tree with the same element/attribute structure, so XPath can be used on HTML as well. XPath selectors work relatively stably.
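Python has a traceback module. Inside an `except` block, `print(traceback.format_exc())` prints the full traceback of the exception exactly as it would appear if it had been raised, without re-raising it, so the program can log the error and carry on.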
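In the crawler project, I ran into an `element not interactable` error. This usually means the element is not visible at the moment you try to click it.
The solution is to add a visibility wait (`EC.visibility_of_element_located` in Python; `until.elementIsVisible` in the JavaScript bindings) after the presence wait (`EC.presence_of_element_located` in Python; `until.elementLocated` in JavaScript).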
The reason this happened is that the web page is not responsive, i.e. it does not adapt to the window size. If the window is too small, the button is not visible. Therefore, the fundamental solution is to maximize the window. The visibility wait above only helps diagnose the problem; it does not cure it.
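Replace fixed `sleep` calls with explicit waits (`WebDriverWait(...).until(...)`), which is more elegant and returns as soon as the condition is met.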
# If the front-end page is changed, the corresponding xpath needs to be modified.
# If the page changes significantly, the code needs to be modified.
import configparser
import csv
from time import sleep

import selenium.common.exceptions
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Determine whether we need to continue waiting based on the web page title,
# URL and whether the element exists/visible/clickable and other conditions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


# TODO: written in an object-oriented way; add a test connection like this
# TODO: constants, user, password, change to configuration file
class Spider:
    """Selenium crawler that exports job details from a paginated table to CSV.

    All settings are read from ``config.ini`` in the working directory:

    * ``[envs]``: ``chrome_driver_location``, ``proxy``, ``username``,
      ``password``
    * ``[xpath]``: ``username_input``, ``password_input``, ``login_button``,
      ``table_body``, ``detail_button``, ``jobname_input``, ``email_input``,
      ``groupid_input``, ``topiclist_pre``, ``close_button``
    * ``[frontend]``: ``next_button``, ``table_maxsize``
    * ``[urls]`` / ``[login]``: one entry per site key; the key is also used
      as the CSV file name.
    """

    # Seconds an explicit wait polls before raising TimeoutException.
    WAIT_TIMEOUT = 10

    def __init__(self):
        """Load ``config.ini`` and start a Chrome session.

        Raises:
            FileNotFoundError: if ``config.ini`` cannot be read.
            KeyError: if a required section or option is missing.
        """
        self.config = configparser.ConfigParser()
        # ConfigParser.read() silently skips missing files; fail early with a
        # clear message instead of a confusing KeyError later on.
        if not self.config.read('config.ini'):
            raise FileNotFoundError('config.ini not found or unreadable')

        options = Options()
        # NOTE(review): `binary_location` is Selenium's setting for the
        # browser binary, while the config key is named
        # `chrome_driver_location` -- confirm which path the config holds.
        options.binary_location = self.config['envs']['chrome_driver_location']
        options.add_argument('--proxy-server=' + self.config['envs']['proxy'])
        options.add_argument('--start-maximized')
        # Pass options by keyword: the first positional parameter of
        # webdriver.Chrome is not `options` in every Selenium release.
        self.driver = webdriver.Chrome(options=options)

    def request(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def test_page(self):
        """Print the current page HTML (debugging aid)."""
        print(self.driver.page_source)

    def login(self):
        """Fill in the login form on the current page and submit it."""
        wait = WebDriverWait(self.driver, self.WAIT_TIMEOUT)
        username = self.config['envs']['username']
        password = self.config['envs']['password']
        xpath_username = self.config['xpath']['username_input']
        xpath_password = self.config['xpath']['password_input']
        xpath_login = self.config['xpath']['login_button']

        username_input = wait.until(
            EC.presence_of_element_located((By.XPATH, xpath_username)))
        password_input = wait.until(
            EC.presence_of_element_located((By.XPATH, xpath_password)))
        username_input.send_keys(username)
        password_input.send_keys(password)
        signin_button = self.driver.find_element(By.XPATH, xpath_login)
        signin_button.click()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def export_running_job(self):
        """Export every job of every configured site to ``<key>.csv``.

        Logs in once (using the login URL of the first site key), then for
        each key in ``[urls]`` opens the page, walks all table pages, and
        writes one CSV row per job: job name, e-mail, group id, topic list.
        The browser is always closed when the method exits, including on
        errors.
        """
        wait = WebDriverWait(self.driver, self.WAIT_TIMEOUT)
        logged_in = False
        try:
            for key in self.config['urls']:
                if not logged_in:
                    self.request(self.config['login'][key])
                    self.login()
                    logged_in = True
                self.request(self.config['urls'][key])
                self.driver.maximize_window()
                # The monitor that comes with mac is relatively small, so it
                # requires some operations to see the buttons.
                self.driver.set_window_size(1800, 900)
                self.driver.set_window_position(-700, 10)

                records = self._collect_records(wait)
                self._write_csv(key + '.csv', records)
        finally:
            # Release the browser even if scraping fails half-way.
            self.close()

    def _collect_records(self, wait):
        """Return ``(jobname, email, groupid, topics)`` for every table row."""
        xpaths = self.config['xpath']
        page_size = int(self.config['frontend']['table_maxsize'])
        # The detail-button XPath is a template whose third-from-last segment
        # is the table row; it is rewritten to `tr[i]` for each row.
        segments = xpaths['detail_button'].split('/')
        table_locator = (By.XPATH, xpaths['table_body'])

        records = []
        while True:
            # Re-locate the rows on every page: the row count of the first
            # page says nothing about the following ones.
            rows = wait.until(EC.presence_of_all_elements_located(table_locator))
            for i in range(1, len(rows) + 1):
                segments[-3] = 'tr[' + str(i) + ']'
                xpath_item = '/'.join(segments)
                detail_button = wait.until(
                    EC.visibility_of_element_located((By.XPATH, xpath_item)))
                detail_button.click()

                jobname = self._read_input_value(wait, xpaths['jobname_input'])
                email = self._read_input_value(wait, xpaths['email_input'])
                groupid = self._read_input_value(wait, xpaths['groupid_input'])
                topiclist = wait.until(EC.presence_of_element_located(
                    (By.XPATH, xpaths['topiclist_pre'])))
                topics = topiclist.get_attribute("innerHTML")
                records.append((jobname, email, groupid, topics))

                close_button = wait.until(EC.visibility_of_element_located(
                    (By.XPATH, xpaths['close_button'])))
                close_button.click()

            # A page that is not full is the last one.
            if len(rows) != page_size:
                break
            if not self._go_to_next_page(wait, rows):
                break
        return records

    def _read_input_value(self, wait, xpath):
        """Wait for the input at *xpath*, clear `disabled`, return its value."""
        element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        self.driver.execute_script(
            "arguments[0].removeAttribute('disabled')", element)
        return element.get_attribute("value")

    def _go_to_next_page(self, wait, rows):
        """Click the next-page button; return False if the table did not advance."""
        xpath_next = self.config['frontend']['next_button']
        try:
            next_button = self.driver.find_element(By.XPATH, xpath_next)
        except selenium.common.exceptions.NoSuchElementException:
            return False
        if not next_button.is_enabled():
            return False

        first_row = rows[0]
        text_before = first_row.text
        next_button.click()
        try:
            # Normally the old rows are detached once the new page renders.
            wait.until(EC.staleness_of(first_row))
        except selenium.common.exceptions.TimeoutException:
            # The front end may reuse row nodes; then a changed first row
            # still means the page advanced. Unchanged text means we are
            # stuck on the same page and must stop to avoid looping forever.
            return first_row.text != text_before
        return True

    @staticmethod
    def _write_csv(path, records):
        """Write *records* to *path*, one CSV row per record."""
        # newline='' lets the csv module control line endings itself.
        with open(path, 'wt', newline='', encoding='utf-8') as f:
            cw = csv.writer(f, lineterminator='\n')
            cw.writerows(records)


if __name__ == '__main__':
    spider = Spider()
    spider.export_running_job()