gevent coroutines: improving IO efficiency

Simple improvements using coroutines

Using gevent is very simple: you just put each task into a list of coroutines to be spawned, the tasks switch whenever they hit an IO operation, and the overall speed improves noticeably. Note that monkey.patch_all() must be executed before the first request is issued, otherwise an error is raised. Looking at the downloaded images, you can see they are not processed in playlist order but in an essentially random one; with asynchronous IO, whichever task finishes its IO operation first returns first.
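
A minimal sketch of the pattern first (the example.com URLs are just placeholders): the two points that matter are that monkey.patch_all() runs before any request is issued, and that the tasks are spawned into a list and then joined.

from gevent import monkey
monkey.patch_all()  # must run before the first request goes out

import gevent
import requests

def fetch(url):
    # each call is one coroutine; gevent switches to another task while this one waits on the socket
    resp = requests.get(url, timeout=10)
    print(url, resp.status_code)

urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']
tasks = [gevent.spawn(fetch, u) for u in urls]  # build the task list
gevent.joinall(tasks)  # run until every coroutine has finished

The full crawler below follows the same structure, split into a producer and a consumer.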

#coding=utf-8
import gevent
from gevent import monkey
#Patch the standard library so its blocking IO becomes cooperative
monkey.patch_all()  # must run before any request is made
import threading
import time
import csv
import requests
from urllib import request
from io import BytesIO
from PIL import Image
from queue import Queue
from threading import currentThread
from bs4 import BeautifulSoup


q = Queue()  # queue holding the playlist-link lists, so consumers can take them without extra locking
lock = threading.Lock()
sig = 1  # flag set to 0 by a consumer that finds the queue empty

# The consumer takes the playlist links passed in by the producer and extracts the details of each playlist
def Consumer():
    global sig  # defined at module level; declared global so the assignment below takes effect
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    #lock.acquire()
    with open("./music.csv", "a", encoding="utf-8-sig") as file:
        write = csv.writer(file)
        ids = q.get()  # one item = the list of playlist links from one index page
        if (ids is None) or (len(ids) == 0):
            print("queue is empty")
            print("%s end" % (currentThread().name))
            sig = 0
            return
        for id in ids:
            url = 'https://music.163.com/' + id['href']  # playlist detail link passed in by the producer
            print("get url: %s" % (url))
            response = requests.get(url=url, headers=headers)
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')

            img = soup.select('img')[0]['data-src'] # image link
            title = soup.select('title')[0].get_text() # title
            idd = soup.select('.s-fc7')[0]['href'].split('=')[-1]#creator id
            nickname = soup.select('.s-fc7')[0].get_text() # nickname
            description = soup.select('p')[1].get_text() # Introduction
            count = soup.select('strong')[0].get_text() # Play times
            song_number = soup.select('span span')[0].get_text() # number of songs
            add_lis = soup.select('a i')[1].get_text() #Add to list times
            share = soup.select('a i')[2].get_text() # share times
            comment = soup.select('a i')[4].get_text() # number of comments
            write.writerow([title,idd,nickname,description,count,song_number,add_lis,share,comment])
            #If rows ever interleave because tasks finish out of order, re-enable the commented-out
            #lock.acquire()/lock.release() around this write and release it once this part is written

            #Next, download and save the cover image
            res = requests.get(url=img, headers=headers)
            image = Image.open(BytesIO(res.content))
            try:
                image.save("./pic/" + str(time.time()) + '.jpg')
            #Handles "OSError: cannot write mode RGBA as JPEG"
            except OSError:
                image.save("./pic/" + str(time.time()) + '.png')

    #lock.release()
#The producer fetches one playlist index page and passes the playlist links to the consumer via the queue
def Producer(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    ids = soup.select('.dec a')  # tags containing the URLs of the playlist detail pages
    q.put(ids)  # put the links into the queue

def main():

    url = []
    #Build the list of index-page URLs
    for i in range(0, 1505, 35):
        url_item = f'https://music.163.com/discover/playlist/?order=hot&cat=Europe&limit=35&offset={i}'
        #NetEase Cloud Music sneakily inserts an odd character between com/ and discover#/, which is why earlier crawls failed
        url.append(url_item)
    #Instead of a thread pool, use gevent coroutines
    url_list = [gevent.spawn(Producer, url_i) for url_i in url]  # build the producer task list
    gevent.joinall(url_list)  # run the IO-bound producers as coroutines and wait for all of them
    print("producer done")
    print("consumer begin")
    print(q.qsize())
    #Write the csv header first so the column layout is fixed
    with open("./music.csv", "a", encoding="utf-8-sig") as csvfile:  # plain utf-8 garbled the file; utf-8-sig works
        writer = csv.writer(csvfile)
        writer.writerow(['playlist title', 'creator id', 'creator nickname', 'introduction', 'play count',
                         'number of songs', 'times added to playlist', 'number of shares', 'number of comments'])
    #With only 10 consumer coroutines, just 350 playlists (10*35) were crawled, because each consumer handles a single
    #queue item; so spawn one consumer per queued page, similar to n threads, instead of adding a crawl loop inside each consumer
    url_list_C = [gevent.spawn(Consumer) for i in range(q.qsize())]  # one consumer coroutine per queued index page
    gevent.joinall(url_list_C)  # wait for every consumer coroutine to finish

    print("mission finished")


if __name__ == "__main__":
    main()

Feature improvements

1. Example: skip playlists with fewer than 500 plays

This case does not require yield; a check inside Consumer, right after count is read, is enough:

count = soup.select('strong')[0].get_text() # Play times
if int(count)<500:
    continue

2. Filter the URLs based on the information in the csv file

This feature is a good fit for yield: following the producer/consumer pattern, one side reads and filters the csv while the other receives the qualifying ids and writes out the corresponding URLs.

The following functions can be added: each time the producer finishes one read (an IO operation) and the row passes the filter, it hands the id over to the consumer function, which then slowly writes the URL out to a file (another IO operation).
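
The code relies on the generator send protocol: the consumer pauses at yield, conn.send(None) primes it (runs it up to the first yield), and every later conn.send(value) resumes it with that value; once the generator finishes, send raises StopIteration at the call site. A toy sketch of that handshake (the names here are illustrative, not part of the crawler):

def writer():
    while True:
        item = yield  # pause here until send() delivers a value
        if isinstance(item, str):
            break  # a string is used as the stop signal
        print("got", item)

conn = writer()
conn.send(None)   # prime the generator: run it up to the first yield
conn.send(42)     # resumes at yield, item == 42, prints "got 42"
try:
    conn.send("stop")  # the generator breaks and returns
except StopIteration:
    pass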

# -*- coding: utf-8 -*-

import csv


#Reading the csv is an IO operation
def producer(conn):  # conn is the consumer generator passed in
    with open(r"D:\Economic and Management Junior\Modern Programming\week14\music.csv", "r", encoding="utf-8") as f:
        print("Start filtering")
        conn.send(None)  # prime the consumer generator: run it up to its first yield
        reader = csv.reader(f)  # an iterator; every next() is one read IO operation
        lis = next(reader)  # skip the header row; note there is a blank line between every two records
        while True:  # read one record, judge it, and pass qualifying ids to the consumer
            try:
                lis = next(reader)  # one read IO operation: consumes the blank separator line
                lis = next(reader)  # reads a real data row; raises StopIteration once past the last line
                if int(lis[4]) > 100000:  # play count above 100000: pass the creator id to the consumer
                    conn.send(int(lis[1]))  # resume the consumer generator with the id
            except StopIteration:
                print("ending")
                try:
                    conn.send("nothing")  # a string tells the consumer to stop
                except StopIteration:
                    pass  # the consumer generator has already finished
                break

#Writing is an IO operation
def consumer():
    with open("./filtered url.txt", "a") as f:
        while True:
            print('start writing')
            id = yield 1
            if isinstance(id, str):  # a string means the filtering has finished
                break
            url = 'https://music.163.com/' + str(id)
            f.write(url + '\n')


def main_1():
    conn=consumer()
    producer(conn)


if __name__ == '__main__':
    main_1()
    # main_2()
    pass
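
The commented-out main_2() call above hints at a follow-up step: feed the filtered URLs back into the gevent crawler from the first part. A possible sketch, assuming the "./filtered url.txt" file produced above and a placeholder fetch function (fetch_filtered is not part of the original code):

import gevent
from gevent import monkey
monkey.patch_all()  # as before, patch before any request is made
import requests

def fetch_filtered(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url=url, headers=headers, timeout=10)
    print(url, resp.status_code)

def main_2():
    with open("./filtered url.txt", "r") as f:
        urls = [line.strip() for line in f if line.strip()]
    gevent.joinall([gevent.spawn(fetch_filtered, u) for u in urls])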
