Collecting Amazon best-seller (Top) product data with the Scrapy framework

Scrapy's CrawlSpider crawler

1. What is CrawlSpider?

CrawlSpider is a subclass of Scrapy's Spider. The plain Spider class is designed to crawl only the pages in the start_urls list, whereas CrawlSpider defines a set of rules (rules) that provide a convenient mechanism for following links: it extracts links from the crawled pages and continues crawling them.
CrawlSpider matches the URLs that satisfy its rules, assembles them into Request objects, sends them to the engine automatically, and can specify a callback function at the same time.
In short: a CrawlSpider crawler can follow links automatically according to its rules.

2. Create a CrawlSpider crawler:

scrapy genspider -t crawl <crawler name> <domain name>

That is, the command to create the amzonTop crawler:
scrapy genspider -t crawl amzonTop amazon.com

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TSpider(CrawlSpider):
    name = 'amzonTop'
    allowed_domains = ['amazon.com']
    start_urls = ['https://amazon.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        return item

rules is a tuple or list containing Rule objects.
A Rule represents one crawling rule and takes parameters such as LinkExtractor, callback and follow.

LinkExtractor: a link extractor that can match URL addresses by regular expression, XPath or CSS selector.
callback: the callback function for the response of each extracted URL. It may be omitted; if it is, the response is not processed by a callback and is only used for further link extraction.
follow: whether the responses of the extracted URLs are themselves run through the rules again to extract more links. True means yes, False means no. A short sketch of the different extractor options is given below.
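
For illustration, a minimal sketch of the three common ways to configure a LinkExtractor; the URL patterns, selectors and the parse_page callback are assumptions, not part of the Amazon project:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']

    rules = (
        # allow: match URLs with a regular expression and parse them with parse_page
        Rule(LinkExtractor(allow=r'/category/\d+'), callback='parse_page', follow=True),
        # restrict_css: extract only links found inside the selected elements (e.g. pagination)
        Rule(LinkExtractor(restrict_css=('.pagination',)), follow=True),
        # restrict_xpaths: extract only links found inside the selected XPath region
        Rule(LinkExtractor(restrict_xpaths='//div[@id="sidebar"]'), follow=True),
    )

    def parse_page(self, response):
        # Hypothetical callback: just record the page URL
        yield {'url': response.url}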

3. Crawl Amazon product data:

1. Create the amazon crawler:

scrapy genspider -t crawl amazonTop2 amazon.com
Project structure:
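
For reference, the generated project follows the standard Scrapy layout (the package name amazonspider is assumed here):

amazonspider/
├── scrapy.cfg
└── amazonspider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── amazonTop2.py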

  1. Extract the pagination URLs of the product list pages and the URLs of the product detail pages

Extract every product's ASIN and rank (best-seller ranking) on the product list page, that is, the ASIN and rank of all the blue boxes.
Extract the ASINs of all color and size variants on the product detail page, that is, all the ASINs in the green box; the green box also contains the ASIN from the blue box.


Green box: think of it as the clothing sizes on a shopping site: S M L XL XXL.

Crawler file: amazonTop2.py

import datetime
import re
import time
from copy import deepcopy

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Amazontop2Spider(CrawlSpider):
    name = 'amazonTop2'
    allowed_domains = ['amazon.com']
    # https://www.amazon.com/Best-Sellers-Tools-Home-Improvement-Wallpaper/zgbs/hi/2242314011/ref=zg_bs_pg_2?_encoding=UTF8&pg=1
    start_urls = ['https://amazon.com/Best-Sellers-Tools-Home-Improvement-Wallpaper/zgbs/hi/2242314011/ref=zg_bs_pg_2']
    # rule extract url
    rules = [
        Rule(LinkExtractor(restrict_css=('.a-selected','.a-normal')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        asin_list_str = "".join(response.xpath('//div[@class="p13n-desktop-grid"]/@data-client-recs-list').extract())
        if asin_list_str:
            asin_list = eval(asin_list_str)
            for asinDict in asin_list:
                item = {}
                if "'id'" in str(asinDict):
                    listProAsin = asinDict['id']
                    pro_rank = asinDict['metadataMap']['render.zg.rank']
                    item['rank'] = pro_rank
                    item['ListAsin'] = listProAsin
                    # Product details page link
                    item['rankAsinUrl'] = f"https://www.amazon.com/Textile-Decorative-Striped-Corduroy-Pillowcases/dp/{listProAsin}/ref=zg_bs_3732341_sccl_1/136-3072892-8658650?psc=1"

                    print("-"*30)
                    print(item)
                    print('-'*30)
                    yield scrapy.Request(item["rankAsinUrl"], callback=self.parse_list_asin,
                                         meta={"main_info": deepcopy(item)})


    def parse_list_asin(self, response):
        """
        Get all category sub-asins of a single product
        :param response:
        :return:
        """
        news_info = response.meta["main_info"]
        list_ASIN_all_findall = re.findall('"colorToAsin":(.*?),"refactorEnabled":true,', str(response.text))
        try:
            try:
                parentASIN = re.findall(r',"parentAsin":"(.*?)",', str(response.text))[-1]
            except:
                parentASIN = re.findall(r'&parentAsin=(.*?)&', str(response.text))[-1]
        except:
            parentASIN = ''
        # parentASIN = parentASIN[-1] if parentASIN !=[] else ""
        print("parentASIN:",parentASIN)
        if list_ASIN_all_findall:
            list_ASIN_all_str = "".join(list_ASIN_all_findall)
            list_ASIN_all_dict = eval(list_ASIN_all_str) # convert to dictionary
            for asin_min_key, asin_min_value in list_ASIN_all_dict.items():
                if asin_min_value:
                    asin_min_value = asin_min_value['asin']
                    news_info['parentASIN'] = parentASIN
                    news_info['secondASIN'] = asin_min_value # sub-asin of a single product category
                    news_info['rankSecondASINUrl'] = f"https://www.amazon.com/Textile-Decorative-Striped-Corduroy-Pillowcases/dp/{asin_min_value}/ref=zg_bs_3732341_sccl_1/136-3072892-8658650?psc=1"
                    yield scrapy.Request(news_info["rankSecondASINUrl"], callback=self.parse_detail_info, meta={"news_info": deepcopy(news_info)})


    def parse_detail_info(self, response):
        """
        Get product details page information
        :param response:
        :return:
        """
        item = response.meta['news_info']
        ASIN = item['secondASIN']
        # print('---------------------------------------------- ----------------------------------------------')
        # with open('amazon_h.html', 'w') as f:
        # f.write(response.body.decode())
        # print('---------------------------------------------- ----------------------------------------------')
        pro_details = response.xpath('//table[@id="productDetails_detailBullets_sections1"]//tr')

        pro_detail = {}
        for pro_row in pro_details:
            pro_detail[pro_row.xpath('./th/text()').extract_first().strip()] = pro_row.xpath('./td//text()').extract_first().strip()

        print("pro_detail",pro_detail)
        ships_from_list = response.xpath(
            '//div[@tabular-attribute-name="Ships from"]/div//span//text()').extract()
        # Logistic party
        try:
            delivery = ships_from_list[-1]
        except:
            delivery = ""
        seller = "".join(response.xpath('//div[@id="tabular-buybox"]//div[@class="tabular-buybox-text"][3]//text()') .extract()).strip().replace("'", "") # seller
        if seller == "":
            seller = "".join(response.xpath('//div[@class="a-section a-spacing-base"]/div[2]/a/text()').extract()).strip ().replace("'", "") # seller
        seller_link_str = "".join(response.xpath('//div[@id="tabular-buybox"]//div[@class="tabular-buybox-text"][3]//a/@href').extract())  # seller link
        # if seller_link_str:
        # seller_link = "https://www.amazon.com" + seller_link_str
        # else:
        #seller_link = ''
        seller_link = "https://www.amazon.com" + seller_link_str if seller_link_str else ''

        brand_link = response.xpath('//div[@id="bylineInfo_feature_div"]/div[@class="a-section a-spacing-none"]/a/@href').extract_first() # brand link
        pic_link = response.xpath('//div[@id="main-image-container"]/ul/li[1]//img/@src').extract_first() # picture link
        title = response.xpath('//div[@id="titleSection"]/h1//text()').extract_first() # title
        star = response.xpath('//div[@id="averageCustomerReviews_feature_div"]/div[1]//span[@class="a-icon-alt"]//text()').extract_first().strip()  # stars
        # selling price
        try:
            price = response.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span[2]/span[@class="a-offscreen"]//text()').extract_first()
        except:
            try:
                price = response.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span[1]/span[@class="a-offscreen"]//text()').extract_first()
            except:
                price = ''
        size = response.xpath('//li[@class="swatchSelect"]//p[@class="a-text-left a-size-base"]//text()').extract_first() # size
        # color
        key_v = str(pro_detail.keys())
        brand = pro_detail['Brand'] if "Brand" in key_v else ''  # brand
        if not brand:
            brand = response.xpath('//tr[@class="a-spacing-small po-brand"]/td[2]//text()').extract_first()
        if not brand:
            brand = response.xpath('//div[@id="bylineInfo_feature_div"]/div[@class="a-section a-spacing-none"]/a/text()').extract_first()
        brand = brand.replace("Brand: ", "").replace("Visit the", "").replace("Store", '').strip() if brand else ''

        color = pro_detail['Color'] if "Color" in key_v else ""
        if not color:
            color = response.xpath('//tr[@class="a-spacing-small po-color"]/td[2]//text()').extract_first()
        if not color:
            color = response.xpath('//div[@id="variation_color_name"]/div[@class="a-row"]/span//text()').extract_first()
        # pattern
        pattern = pro_detail['Pattern'] if "Pattern" in key_v else ""
        if pattern == "":
            pattern = response.xpath('//tr[@class="a-spacing-small po-pattern"]/td[2]//text()').extract_first().strip()
        # material material
        try:
            material = pro_detail['Material']
        except:
            material = response.xpath('//tr[@class="a-spacing-small po-material"]/td[2]//text()').extract_first().strip()
        # shape shape
        shape = pro_detail['Shape'] if "Shape" in key_v else ""
        if shape == "":
            shape = response.xpath('//tr[@class="a-spacing-small po-item_shape"]/td[2]//text()').extract_first().strip()
        # style # style
        # five point description
        five_points = response.xpath('//div[@id="feature-bullets"]/ul/li[position()>1]//text()').extract_first().replace('"', "'")
        size_num = len(response.xpath('//div[@id="variation_size_name"]/ul/li').extract()) # size number
        color_num = len(response.xpath('//div[@id="variation_color_name"]//li').extract()) # number of colors
        # variant_num = # number of variants
        # style # style link
        #manufacturer
        # manufacturers
        try:
            Manufacturer = pro_detail['Manufacturer'] if "Manufacturer" in str(pro_detail) else " "
        except:
            Manufacturer = ""
        item_weight = pro_detail['Item Weight'] if "Weight" in str(pro_detail) else '' # item weight
        product_dim = pro_detail['Product Dimensions'] if "Product Dimensions" in str(pro_detail) else '' # product size
        #product_material
        # product quality
        try:
            product_material = pro_detail['Material']
        except:
            product_material = ''
        # fabric_type
        # fabric composition
        try:
            fabric_type = pro_detail['Fabric Type'] if "Fabric Type" in str(pro_detail) else " "
        except:
            fabric_type = ""

        star_list = response.xpath('//table[@id="histogramTable"]//tr[@class="a-histogram-row a-align-center"]//td[3]//a/text()').extract()
        if star_list:
            try:
                star_1 = star_list[0].strip()
            except:
                star_1 = 0
            try:
                star_2 = star_list[1].strip()
            except:
                star_2 = 0
            try:
                star_3 = star_list[2].strip()
            except:
                star_3 = 0
            try:
                star_4 = star_list[3].strip()
            except:
                star_4 = 0
            try:
                star_5 = star_list[4].strip()
            except:
                star_5 = 0

        else:
            star_1 = 0
            star_2 = 0
            star_3 = 0
            star_4 = 0
            star_5 = 0

        if "Date First Available" in str(pro_detail):
            data_first_available = pro_detail['Date First Available']
            if data_first_available:
                data_first_available = datetime.datetime.strftime(
                    datetime.datetime.strptime(data_first_available, '%B %d, %Y'), '%Y/%m/%d')
            else:
                data_first_available = ""
        reviews_link = f'https://www.amazon.com/MIULEE-Decorative-Pillowcase-Cushion-Bedroom/product-reviews/{<!-- -->ASIN}/ref=cm_cr_arp_d_viewopt_fmt?ie=UTF8 & amp;reviewerType =all_reviews &formatType=current_format &pageNumber=1'
        # reviews_num, ratings_num # Number of reviews, ratings
        scrap_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item['delivery'] = delivery
        item['seller'] = seller
        item['seller_link'] = seller_link
        item['brand_link'] = brand_link
        item['pic_link'] = pic_link
        item['title'] = title
        item['brand'] = brand
        item['star'] = star
        item['price'] = price
        item['color'] = color
        item['pattern'] = pattern
        item['material'] = material
        item['shape'] = shape
        item['five_points'] = five_points
        item['size_num'] = size_num
        item['color_num'] = color_num
        item['Manufacturer'] = Manufacturer
        item['item_weight'] = item_weight
        item['product_dim'] = product_dim
        item['product_material'] = product_material
        item['fabric_type'] = fabric_type
        item['star_1'] = star_1
        item['star_2'] = star_2
        item['star_3'] = star_3
        item['star_4'] = star_4
        item['star_5'] = star_5
        # item['ratings_num'] = ratings_num
        # item['reviews_num'] = reviews_num
        item['scrap_time'] = scrap_time
        item['reviews_link'] = reviews_link
        item['size'] = size
        item['data_first_available'] = data_first_available

        yield item
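
A small design note: parse_item and parse_list_asin turn the extracted strings into Python objects with eval(). Both strings are JSON embedded in the page, so json.loads() is the safer choice (eval will happily execute arbitrary code and breaks on JSON literals such as true/null). A minimal sketch of the substitution, assuming the extracted strings are valid JSON:

import json

# in parse_item
asin_list = json.loads(asin_list_str)                 # instead of eval(asin_list_str)

# in parse_list_asin
list_ASIN_all_dict = json.loads(list_ASIN_all_str)    # instead of eval(list_ASIN_all_str)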

Once a certain number of pages has been collected, Amazon starts to block the crawler, so we need to rotate the IP, recognize the verification code (captcha), and so on. That is what the download middleware below handles.

4. Download middleware

Default methods of Downloader Middlewares:

  • process_request(self, request, spider):

    1. Called for every request that passes through the download middleware.
    2. Returning None (or returning nothing): the request object is passed on to the downloader, or through the engine to the process_request methods of other, lower-priority middlewares.
    3. Returning a Response object: no further request is made; the response is returned to the engine.
    4. Returning a Request object: the request object is passed back to the scheduler through the engine; the process_request methods of other, lower-priority middlewares are not called.

  • process_response(self, request, response, spider):

    1. Called when the downloader has completed the HTTP request and is passing the response to the engine.
    2. Returning a Response: it is handed to the spider through the engine, or to the process_response methods of other, lower-priority download middlewares.
    3. Returning a Request object: it is passed through the engine to the scheduler to be requested again; the process_request methods of other, lower-priority middlewares are not called for it.
    Enable and configure middlewares in settings.py; the smaller the value, the higher its execution priority (see the snippet below).
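
For example, a sketch of enabling the three middlewares defined below in settings.py; the package name amazonspider and the priority numbers are assumptions:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # smaller number = higher priority (runs earlier on requests)
    "amazonspider.middlewares.ProxyMiddleware": 543,
    "amazonspider.middlewares.AmazonspiderDownloaderMiddleware": 544,
    "amazonspider.middlewares.AmazonspiderVerifyMiddleware": 545,
}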

middlewares.py

  1. Set a proxy to rotate the IP
import random

import easyocr
import requests
from lxml import etree
from scrapy import signals
from scrapy.http import HtmlResponse
from .settings import USER_AGENTS_LIST  # USER_AGENTS_LIST is defined in settings.py


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Set the proxy; fill in proxyServer / proxyAuth for the provider you actually use
        request.meta['proxy'] = proxyServer
        # Set proxy authentication
        request.headers["Proxy-Authorization"] = proxyAuth

    # Check whether the proxy IP is still usable
    def process_response(self, request, response, spider):
        if response.status != 200:
            request.dont_filter = True  # the re-sent request object can enter the queue again
            # Returning the request hands it back to the engine and then to the scheduler;
            # the first middleware's process_request will run again from scratch
            return request
        return response
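
proxyServer and proxyAuth are not defined above; a hedged sketch of what they might look like (host, port, user and password are placeholders for whatever your proxy provider issues):

import base64

proxyUser = "username"
proxyPass = "password"
proxyHost = "proxy.example.com"
proxyPort = "9020"

proxyServer = f"http://{proxyHost}:{proxyPort}"
# Basic auth value expected by most HTTP proxies in the Proxy-Authorization header
proxyAuth = "Basic " + base64.b64encode(f"{proxyUser}:{proxyPass}".encode()).decode()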
  2. Replace the User-Agent or cookies
class AmazonspiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # USER_AGENTS_LIST is defined in settings.py
        user_agent = random.choice(USER_AGENTS_LIST)
        request.headers['User-Agent'] = user_agent
        cookies_str = 'cookie string copied from the browser'
        # Convert cookies_str to cookies_dict
        cookies_dict = {i[:i.find('=')]: i[i.find('=') + 1:] for i in cookies_str.split('; ')}
        request.cookies = cookies_dict
        # print("------------------------------------------- -----")
        # print(request. headers)
        # print("------------------------------------------- -----")
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
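
USER_AGENTS_LIST is read from settings.py; a hedged example of what it could contain (the UA strings are just samples):

# settings.py
USER_AGENTS_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
]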
  3. Amazon verification code (captcha)
def captcha_verfiy(img_name):
    # Recognize the verification code
    reader = easyocr.Reader(['ch_sim', 'en'])
    # reader = easyocr.Reader(['en'], detection='DB', recognition='Transformer')
    # Read the image
    result = reader.readtext(img_name, detail=0)[0]
    # result = reader.readtext('https://www.somewebsite.com/chinese_tra.jpg')
    if result:
        result = result.replace(' ', '')
    return result


def download_captcha(captcha_url):
    # Download the verification code image
    response = requests.get(captcha_url, stream=True)
    try:
        with open(r'./captcha.png', 'wb') as logFile:
            for chunk in response:
                logFile.write(chunk)
            print("Download done!")
    except Exception as e:
        print("Download captcha error!", e)


class AmazonspiderVerifyMiddleware:
    # Captcha
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):

        return None

    def process_response(self, request, response, spider):
        # print(response.url)
        if 'Captcha' in response.text:
            headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
            }
            session = requests.session()
            resp = session.get(url=response.url, headers=headers)
            response1 = etree.HTML(resp.text)
            # Get the captcha image URL
            captcha_url = "".join(response1.xpath('//div[@class="a-row a-text-center"]/img/@src'))
            amzon = "".join(response1.xpath("//input[@name='amzn']/@value"))
            amz_tr = "".join(response1.xpath("//input[@name='amzn-r']/@value"))
            # Download and save the verification code image
            download_captcha(captcha_url)
            # Recognize the captcha letters
            captcha_text = captcha_verfiy('captcha.png')
            # Re-send the request with the recognized captcha
            url_new = f"https://www.amazon.com/errors/validateCaptcha?amzn={amzon}&amzn-r={amz_tr}&field-keywords={captcha_text}"
            resp = session.get(url=url_new, headers=headers)
            if "Sorry, we just need to make sure you're not a robot" not in str(resp.text):
                response2 = HtmlResponse(url=url_new, headers=headers, body=resp.text, encoding='utf-8')
                if "Sorry, we just need to make sure you're not a robot" not in str(response2.text):
                    return response2
            # Captcha solving failed: re-send the original request
            return request
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

I am new to Scrapy and wrote this up casually as a record; please point out any problems you find in it. hey~~~