Crawling Today’s Headlines Pictures (bs4 Method)

1. Preliminary crawling

Here we can easily figure out that this web page uses a GET request. (Originally I planned to crawl pictures of beautiful women, but I later discovered that those images would be flagged as violations, so I switched to crawling landscape images. To crawl the original subject, you only need to change the URL.)

For crawling images, I directly thought of using the bs4 parser to crawl from the beginning.

import requests
from bs4 import BeautifulSoup


class Baidu_photo_get(object):
    """Single-page crawler for a Toutiao image-search results page.

    Fetches the results page once with a GET request, extracts the ``src``
    of every ``<img>`` tag with BeautifulSoup, and saves each image as a
    numbered ``.jpeg`` file under ``./photo/``.
    """

    def __init__(self):
        # Sequential counter used to produce unique, ordered file names.
        self.number = 1
        # GET URL of the search results page; change ``keyword`` to crawl a
        # different topic.
        # NOTE(review): ``search_json`` was captured from a live browser
        # session and may expire — confirm against a fresh session.
        self.url = (
            'https://so.toutiao.com/search?keyword=scenery&pd=atlas&dvpf=pc'
            '&aid=4916&page_num=0&search_json={"from_search_id":'
            '"202311012132564BC1AED711837A44558F","origin_keyword":"Beauty",'
            '"image_keyword":"Beauty"}&source=input'
        )
        # Browser-like headers; the Cookie is session-bound and will need
        # refreshing once it expires.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76',
            'Cookie': 'msToken=inSIYQCMcgQBS_WUQq9eZGwZYXJ3aJH4oisRFRvTPXjWLetIE2Fgy0gygqV4YwjZyVchLEh6ublkb_9a1EB_ZTaFa52rRtlNlBJPIYc7; tt_webid=7295968954378421798; _ga_QEHZPBE5HH=GS1.1.1698725159.1.0.1698725159.0.0.0; _ga=GA1.1.4784048.1698725160; ttwid=1|ZHZf2H84ODU8HGvNgE6R7ItycHQGup5OoD9-LskKIik|1698725160|91e339d248bafc5260a492036ed670625bbd96ea3d17febdcb9ccc16dfb39bb1; __ac_nonce=065407d2c00b3483a557b; __ac_signature=_02B4Z6wo00f017yueGAAAIDA6Szz.5tt6XO8jnzAAIp01c; __ac_referer=https://www.toutiao.com/; _tea_utm_cache_4916=undefined; _S_WIN_WH=1488_742; _S_DPR=1.25; _S_IPAD=0; s_v_web_id=verify_lodt3rdk_jZ1cBKNS_em8n_4wCG_8hSi_foqXnR2uODsR'
        }

    def data_get_index(self):
        """Fetch the results page; return its HTML on HTTP 200, else None."""
        resp = requests.get(url=self.url, headers=self.headers)
        return resp.text if resp.status_code == 200 else None

    def parse_data_index(self, response):
        """Extract every <img> src from *response* HTML and save each image."""
        soup = BeautifulSoup(response, 'lxml')
        for tag in soup.find_all('img'):
            src = tag.get('src')
            if src:  # skip <img> tags that carry no src attribute
                self.save_photo_data(src)

    def save_photo_data(self, url):
        """Download one image from *url* and write it to ./photo/<n> photo.jpeg."""
        import os
        # Bug fix: the target directory may not exist yet.
        os.makedirs('./photo', exist_ok=True)
        file_data = f'{self.number} photo'
        # Download before opening the file so a failed request does not
        # leave an empty .jpeg behind.
        img = requests.get(url).content
        with open('./photo/' + file_data + '.jpeg', 'wb') as f:
            f.write(img)
        print(f'{file_data} picture --saved!')
        self.number += 1

    def run(self):
        """Entry point: fetch the page and save every image found on it."""
        response = self.data_get_index()
        # print(response)
        if response is not None:  # guard: the request may have failed
            self.parse_data_index(response)

    def run(self):
        response = self.data_get_index()
        # print(response)
        self.parse_data_index(response)


if __name__ == '__main__':
    # Run the single-page crawler when executed as a script.
    crawler = Baidu_photo_get()
    crawler.run()

The results showed that only 40 images were crawled, but through practical analysis of web page images, we found that there were far more than 40 images. Therefore, we need to conduct a more in-depth analysis of the page.

2. Advanced crawling

We return to the target website’s page and analyze it again. We find that each time the next group of photos loads, the page_num parameter increases by one.

Therefore, if you want to crawl all the pictures on the complete page, you need to get all the string parameters here and convert these parameters into url address. Only then can you get all the pictures.

Here, through preliminary crawling, we found that there are 40 pictures in a group of pictures. Therefore, here we simply crawl 4 groups of pictures, so we need to use this library here.

from urllib.parse import urlencode

It can convert the payload parameters into the URL of the corresponding subsequent page. The page_num needs to be constructed manually by ourselves. The improved code is as follows:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode


class Toutiao_data_get(object):
    """Multi-page crawler for Toutiao image-search results.

    Builds a search URL per ``page_num``, fetches each page, extracts the
    ``src`` of every ``<img>`` tag with BeautifulSoup, and saves each image
    as a numbered ``.jpeg`` file under ``./photo/``.
    """

    def __init__(self):
        # Sequential counter used to produce unique, ordered file names.
        self.number = 1
        # Base results URL (page 0), kept for reference; per-page URLs are
        # built by url_data_get().
        self.url = (
            'https://so.toutiao.com/search?keyword=scenery&pd=atlas&dvpf=pc'
            '&aid=4916&page_num=0&search_json={"from_search_id":'
            '"202311012132564BC1AED711837A44558F","origin_keyword":"Beauty",'
            '"image_keyword":"Beauty"}&source=input'
        )
        # Browser-like headers; the Cookie is session-bound and will need
        # refreshing once it expires.
        # NOTE(review): the original __ac_referer value was garbled in the
        # paste; truncated to the site root — confirm against a fresh session.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76',
            'Cookie': 'msToken=inSIYQCMcgQBS_WUQq9eZGwZYXJ3aJH4oisRFRvTPXjWLetIE2Fgy0gygqV4YwjZyVchLEh6ublkb_9a1EB_ZTaFa52rRtlNlBJPIYc7; tt_webid=7295968954378421798; _ga=GA1.1.4784048.1698725160; _tea_utm_cache_4916=undefined; _S_DPR=1.25; _S_IPAD=0; s_v_web_id=verify_lodt3rdk_jZ1cBKNS_em8n_4wCG_8hSi_foqXnR2uODsR; ttwid=1|ZHZf2H84ODU8HGvNgE6R7ItycHQGup5OoD9-LskKIik|1698845574|e105d7a136bdffc51f9cb61807cd7ff129a7e164783e088dc19e57e95b345b1f; _ga_QEHZPBE5HH=GS1.1.1698845574.2.0.1698845623.0.0.0; _S_WIN_WH=659_742; __ac_nonce=065425afa003b701c73a7; __ac_signature=_02B4Z6wo00f0196I-ogAAIDAiwpxFB5jZHPeqP4AAJLpd0; __ac_referer=https://so.toutiao.com/'
        }

    def url_data_get(self, num):
        """Return the search URL for result page *num*.

        Bug fix: the original stored an empty dict in ``page_num`` and then
        called ``.format(num)`` on the urlencoded string; urlencode
        percent-encodes the braces (``%7B%7D``) so the substitution never
        happened and every request fetched the same bogus page.  The page
        number is now passed directly.
        """
        parms = {
            'keyword': 'landscape',
            'pd': 'atlas',
            'dvpf': 'pc',
            'aid': '4916',
            'page_num': num,
            'search_json': '{"from_search_id":"202311012132564BC1AED711837A44558F","origin_keyword":"Beauty","image_keyword":"Beauty"}',
            'source': 'input',
        }
        return 'https://so.toutiao.com/search?' + urlencode(parms)

    def data_get_index(self, num):
        """Fetch result page *num*; return its HTML on HTTP 200, else None."""
        resp = requests.get(url=self.url_data_get(num), headers=self.headers)
        return resp.text if resp.status_code == 200 else None

    def parse_data_index(self, response):
        """Extract every <img> src from *response* HTML and save each image."""
        soup = BeautifulSoup(response, 'lxml')
        # print(html)
        for tag in soup.find_all('img'):
            src = tag.get('src')
            if src:  # skip <img> tags that carry no src attribute
                self.save_data_get(src)

    def save_data_get(self, url):
        """Download one image from *url* and write it to ./photo/<n> photo.jpeg."""
        import os
        # Bug fix: the target directory may not exist yet.
        os.makedirs('./photo', exist_ok=True)
        file_data = f'{self.number} photo'
        # Download before opening the file so a failed request does not
        # leave an empty .jpeg behind.
        img = requests.get(url).content
        with open('./photo/' + file_data + '.jpeg', 'wb') as f:
            f.write(img)
        print(f'{file_data} picture--saved!')
        self.number += 1

    def run(self):
        """Crawl result pages 1-4 and save every image found."""
        for num in range(1, 5):
            response = self.data_get_index(num)
            if response is not None:  # skip pages that failed to load
                self.parse_data_index(response)


if __name__ == '__main__':
    # Run the multi-page crawler when executed as a script.
    crawler = Toutiao_data_get()
    crawler.run()

The crawled data is shown below:

Finally, the crawling of images can be correctly implemented.