1 For asynchronously loaded HTML pages, the data cannot be located by XPath in the raw page source
1.0 Website Analysis
# Taobao search page URL: https://s.taobao.com/search?q=mobile phone
# Search list page analysis:
#   First page: https://s.taobao.com/search?q=mobile phone
#   Second and later pages: the results are generated by ajax requests (GET), and the returned data is HTML.
#   Because the list is rendered by ajax, the item markup is not present in the initial page source.
1.1 Create project
scrapy startproject taobaoSpider
cd taobaoSpider
scrapy genspider taobao taobao.com
1.2 Create a crawler
scrapy genspider taobao "taobao.com"
1.3 Add tool function module utils.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common import exceptions
import json


def create_chrome_driver(headless=False):
    """Create a Chrome WebDriver configured to evade basic bot detection.

    Args:
        headless: run Chrome without a visible window when True.

    Returns:
        A configured ``selenium.webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    # Remove the "Chrome is being controlled by automated test software" banner.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Path of the chromedriver executable (expected next to the project).
    service = Service('chromedriver.exe')
    browser = webdriver.Chrome(service=service, options=options)
    # Anti-detection: make navigator.webdriver report undefined on every page.
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'})
    return browser


def add_cookies(browser, cookie_file):
    """Load cookies from a JSON file and add the secure ones to *browser*.

    The browser must already be on a page of the cookies' domain; otherwise
    Selenium raises InvalidCookieDomainException (caught and logged here).

    Args:
        browser: a live WebDriver instance.
        cookie_file: path to a JSON file produced by ``browser.get_cookies()``.
    """
    with open(cookie_file, 'r', encoding='utf-8') as file:
        cookies_list = json.load(file)
    for cookie_dict in cookies_list:
        # .get() avoids a KeyError when a cookie record lacks the 'secure' key
        # (the original indexed it directly and could crash on such records).
        if cookie_dict.get('secure'):
            try:
                browser.add_cookie(cookie_dict)
            except exceptions.InvalidCookieDomainException as e:
                print(e.msg)


def test():
    """Smoke-test helper used to verify the module imports correctly."""
    print("ggggggg")
1.4 Test Taobao page anti-crawling mechanism
1.4.1 taobao_login.py simulates login and generates cookies.json
from utils import create_chrome_driver, add_cookies, test
import json
import time
from selenium.webdriver.common.by import By

# Simulate a Taobao login with Selenium and persist the session cookies to
# taobao_cookie.json so later crawls can reuse the authenticated session.

browser = create_chrome_driver()
time.sleep(1)

# Open the login page and fill in the credentials.
browser.get('https://login.taobao.com/member/login.jhtml')
time.sleep(1)
el = browser.find_element(by=By.XPATH, value='//*[@id="fm-login-id"]')
el.send_keys('[email protected]')
el = browser.find_element(by=By.XPATH, value='//*[@id="fm-login-password"]')
el.send_keys('123456')
el = browser.find_element(by=By.XPATH, value='//*[@id="login-form"]/div[4]/button')
el.click()
# Wait for the login round-trip (and any captcha challenge) to complete.
time.sleep(6)

# Save the authenticated cookies for reuse by the crawler middleware.
with open('taobao_cookie.json', 'w') as file:
    json.dump(browser.get_cookies(), file)
time.sleep(1)

# Sanity check: a search request should now render as a logged-in user.
browser.get('https://s.taobao.com/search?q=手机')
time.sleep(1)
# Keep the browser open for manual inspection.
time.sleep(600)
1.4.2 taobao_login_after.py Taobao login test
from utils import create_chrome_driver, add_cookies, test
import json
import time
from selenium.webdriver.common.by import By

# Verify that the saved cookies log us back into Taobao without a password.

browser = create_chrome_driver()
time.sleep(1)

# A page on the target domain must be loaded before cookies can be set;
# otherwise Selenium raises an InvalidCookieDomainException.
browser.get('https://taobao.com')
time.sleep(1)
add_cookies(browser, 'taobao_cookie.json')
time.sleep(1)

# With the cookies applied, the search page should render as a logged-in user.
browser.get('https://s.taobao.com/search?q=手机')
time.sleep(1)
# Keep the window open for manual inspection.
time.sleep(600)
1.5 Modify download middleware
from scrapy import signals
from scrapy.http import HtmlResponse
import time

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from utils import create_chrome_driver, add_cookies
from taobaoSpider.spiders.taobao import TaobaoSpider


class TaobaospiderDownloaderMiddleware:
    """Downloader middleware that renders Taobao pages with Selenium.

    Taobao's search results are injected by ajax, so a plain HTTP response
    does not contain the item markup.  For requests coming from TaobaoSpider
    this middleware drives a real, cookie-authenticated Chrome instance,
    scrolls the page so the lazy-loaded items render, and returns the final
    DOM as an HtmlResponse.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: build the middleware and wire up signals.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def __init__(self):
        # One shared browser for the whole crawl; authenticate once by
        # replaying the cookies saved by taobao_login.py.
        self.browser = create_chrome_driver()
        self.browser.get('https://www.taobao.com')
        add_cookies(self.browser, 'taobao_cookie.json')

    def __del__(self):
        # BUG FIX: quit() shuts down every window AND the chromedriver
        # process; close() only closed the current tab and leaked the driver.
        self.browser.quit()

    def process_request(self, request, spider):
        """Render *request* in Chrome and short-circuit the download.

        Returns:
            None for spiders other than TaobaoSpider (normal download
            proceeds), otherwise an HtmlResponse built from the rendered DOM.
        """
        if not isinstance(spider, TaobaoSpider):
            return None
        self.browser.get(request.url)
        time.sleep(2)
        # Scroll down gradually so the ajax-lazy-loaded items are rendered.
        for i in range(45, 4514, 400):
            js = f"var q=document.documentElement.scrollTop={i}"
            self.browser.execute_script(js)
            time.sleep(0.5)
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8')

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.  Must return
        # a Response or Request object, or raise IgnoreRequest.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares handle the exception.
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
Modify download middleware configuration
# Enable the custom Selenium downloader middleware (priority 543; lower
# numbers run closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    "taobaoSpider.middlewares.TaobaospiderDownloaderMiddleware": 543,
}
1.6 Modify the crawler code
1.6.1 Add data model
import scrapy


class TaobaospiderItem(scrapy.Item):
    """Container for one Taobao search-result entry."""

    # Product title text as scraped from the search-result card.
    title = scrapy.Field()
1.6.2 Modify crawler code
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse

from taobaoSpider.items import TaobaospiderItem


class TaobaoSpider(scrapy.Spider):
    """Crawl Taobao search-result pages.

    The pages themselves are rendered by the Selenium downloader middleware,
    so ``parse`` receives the fully rendered DOM.
    """

    name = "taobao"
    allowed_domains = ["taobao.com"]
    # start_urls is unused: requests are built in start_requests().

    def start_requests(self):
        """Yield one search request per keyword per result page.

        Taobao paginates with the ``s`` query parameter, 48 items per page.
        """
        # keywords = ['Mobile phone', 'Laptop computer', 'Keyboard and mouse set']
        keywords = ['mobile phone']
        for keyword in keywords:
            for page in range(1):
                # BUG FIX: the original URL used HTML-escaped '& amp;'
                # separators (an invalid query string) and called
                # 'yieldRequest(url)', which is a NameError.
                url = f'https://s.taobao.com/search?q={keyword}&s={page * 48}'
                yield Request(url)

    def parse(self, response: HtmlResponse):
        """Extract item titles from a rendered search-result page."""
        # BUG FIX: the original XPaths contained stray spaces
        # ('div /div', 'div[ 2]'), which made them invalid and match nothing.
        cssitem_list = response.xpath(
            '//*[@id="root"]/div/div[3]/div[1]/div[1]/div[2]/div[3]/div/div')
        for cssitem in cssitem_list:
            item = TaobaospiderItem()
            item['title'] = cssitem.xpath(
                './a/div/div[1]/div[2]/div/span/text()').extract()
            yield item
1.6.3 Test running crawler
# Run the crawler:
scrapy crawl taobao
# or run it and export the results to CSV:
scrapy crawl taobao -o taobao.csv