Simple improvements using coroutines
Using gevent is straightforward: put each task into a list of greenlets to be spawned, and gevent switches between them whenever one blocks on IO, which makes the crawl noticeably faster. Note that monkey.patch_all() must run before any requests are made (the author places it immediately after importing gevent, ahead of the other imports), otherwise an error is raised. Looking at the downloaded images, they are not saved in playlist order but seemingly at random: with asynchronous IO, whichever task finishes its IO operation first returns first.
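As a minimal illustration of that out-of-order behaviour (separate from the crawler below; the sleep times are made up and stand in for real IO), spawned greenlets finish in order of IO completion, not spawn order:

import gevent
from gevent import monkey
monkey.patch_all()  # must run before the blocking calls below
import time

def task(n):
    time.sleep(n % 3)  # patched sleep yields to other greenlets, like real IO
    print(f"task {n} finished")

# Tasks 0 and 3 (shortest "IO") print first, then 1 and 4, then 2
gevent.joinall([gevent.spawn(task, n) for n in range(5)])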
# coding=utf-8
import gevent
from gevent import monkey
# Mark IO as non-blocking for the standard library
monkey.patch_all()  # this must run before any requests are made
import threading
import time
import csv
import requests
from io import BytesIO
from PIL import Image
from queue import Queue
from threading import currentThread
from bs4 import BeautifulSoup

q = Queue()  # queue for the playlist ids, so consumers can fetch ids without a lock
lock = threading.Lock()
sig = 1

# The consumer extracts the playlist details from the ids passed in by the producer
def Consumer():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # lock.acquire()
    with open("./music.csv", "a", encoding="utf-8-sig") as file:
        write = csv.writer(file)
        ids = q.get()
        if (ids is None) or (len(ids) == 0):
            print("queue is empty")
            print("%s end" % (currentThread().name))
            sig = 0
            return
        for id in ids:
            url = 'https://music.163.com/' + id['href']  # detail-page link passed by the producer
            print("get url: %s" % (url))
            response = requests.get(url=url, headers=headers)
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            img = soup.select('img')[0]['data-src']                # cover image link
            title = soup.select('title')[0].get_text()             # title
            idd = soup.select('.s-fc7')[0]['href'].split('=')[-1]  # creator id
            nickname = soup.select('.s-fc7')[0].get_text()         # creator nickname
            description = soup.select('p')[1].get_text()           # introduction
            count = soup.select('strong')[0].get_text()            # play count
            song_number = soup.select('span span')[0].get_text()   # number of songs
            add_lis = soup.select('a i')[1].get_text()             # times added to a playlist
            share = soup.select('a i')[2].get_text()               # share count
            comment = soup.select('a i')[4].get_text()             # comment count
            write.writerow([title, idd, nickname, description, count,
                            song_number, add_lis, share, comment])
            # To keep rows from interleaving when coroutines switch mid-write, the file
            # could be locked here and released after this part (see the commented
            # lock.acquire()/lock.release() calls)
            # Next, process the cover image
            res = requests.get(url=img, headers=headers)
            image = Image.open(BytesIO(res.content))
            try:
                image.save("./pic/" + str(time.time()) + '.jpg')
            except OSError:  # handles "OSError: cannot write mode RGBA as JPEG"
                image.save("./pic/" + str(time.time()) + '.png')
    # lock.release()

# The producer collects playlist ids and passes them to the consumer
def Producer(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    ids = soup.select('.dec a')  # tags containing the playlist detail-page URLs
    q.put(ids)  # put the ids into the queue

def main():
    url = []  # holds the list-page URLs
    for i in range(0, 1505, 35):
        url_item = f'https://music.163.com/discover/playlist/?order=hot&cat=Europe&limit=35&offset={i}'
        # NetEase Cloud sneakily inserts a strange character between com/ and discover#/,
        # which is why earlier crawl attempts returned nothing
        url.append(url_item)
    # Coroutine approach with gevent instead of a thread pool
    url_list = [gevent.spawn(Producer, url_i) for url_i in url]  # build the task list
    gevent.joinall(url_list)  # run the IO with coroutines
    print("producer done")
    print("consumer begin")
    print(q.qsize())
    # Write the CSV header row first
    with open("./music.csv", "a", encoding="utf-8-sig") as csvfile:
        # plain utf-8 did not work here; utf-8-sig is needed
        writer = csv.writer(csvfile)
        writer.writerow(['playlist title', 'creator id', 'creator nickname', 'introduction',
                         'play count', 'number of songs', 'times added to playlist',
                         'share count', 'comment count'])
    # With a fixed 10 coroutines at first, only 350 playlists were crawled, exactly 10*35,
    # which showed the remaining queue entries were never processed
    url_list_C = [gevent.spawn(Consumer) for i in range(q.qsize())]
    # Spawn one coroutine per crawled page, similar to n threads; otherwise each
    # consumer would have to loop over the queue itself
    gevent.joinall(url_list_C)
    print("mission finished")

if __name__ == "__main__":
    main()
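The fix above spawns one Consumer per queue entry. An alternative, sketched here under the assumption that per-playlist processing is factored into a hypothetical handle() helper, is a small fixed pool of consumers that each loop until the queue is drained, so the coroutine count stays constant however many pages are crawled:

import gevent
from queue import Empty

def looping_consumer(q, handle):
    # Drain the queue until empty; handle() stands in for the per-playlist
    # scraping and CSV writing done in Consumer above (hypothetical helper)
    while True:
        try:
            ids = q.get_nowait()
        except Empty:
            return
        for id in ids:
            handle(id)

# A fixed pool of 10 coroutines now covers any number of pages:
# gevent.joinall([gevent.spawn(looping_consumer, q, handle) for _ in range(10)])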
Feature improvements
1. Example: keep only playlists with more than 500 plays
yield is not needed for this one; a short check inside Consumer is enough:
count = soup.select('strong')[0].get_text()  # play count
if int(count) < 500:
    continue
2. Filter the URLs based on the information in the CSV file
This feature is a good fit for yield: following the example above, one function acts as the producer and another as the consumer, passing URLs between them.
After each read IO completes and the record passes the filter, the id is handed to the consumer function, which writes the output one line at a time.
# -*- coding: utf-8 -*-
import csv

# Reading is an IO operation
def producer(conn):  # conn is the consumer generator passed in
    with open(r"D:\Economic and Management Junior\Modern Programming\week14\music.csv",
              "r", encoding="utf-8") as f:
        print("Start filtering")
        conn.send(None)  # prime the generator
        reader = csv.reader(f)  # creates an iterator; each read is an IO operation
        lis = next(reader)  # skip the header row first; note there is a blank line between records
        while True:
            # Read one record, then analyse and judge it, skipping the blank line
            try:
                lis = next(reader)  # one read IO operation (the blank line)
                lis = next(reader)  # actually read a data row; raises after the trailing blank line
                if int(lis[4]) > 100000:  # play count above 100000: pass the id to the consumer
                    conn.send(int(lis[1]))  # send the value into the generator to run it
            except StopIteration:
                print("ending")
                try:
                    conn.send("nothing")  # tell the consumer to finish
                except StopIteration:
                    pass  # the consumer has exited
                break

# Writing is an IO operation
def consumer():
    with open("./filtered url.txt", "a") as f:
        while True:
            print('start writing')
            id = yield 1
            if isinstance(id, str):  # a string means the filtering is finished
                break
            url = 'https://music.163.com/' + str(id)
            f.write(url + '\n')

def main_1():
    conn = consumer()
    producer(conn)

if __name__ == '__main__':
    main_1()
    # main_2()
    pass
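One detail worth calling out: a generator-based consumer must be primed with send(None) before it can receive values, which is why producer calls conn.send(None) first. A stripped-down sketch of the same pattern, with toy values and no file IO:

def sink():
    while True:
        value = yield        # pauses here until the producer sends a value
        if isinstance(value, str):
            break
        print("received", value)

conn = sink()
conn.send(None)              # prime the generator: run it up to the first yield
conn.send(42)                # printed by the consumer
try:
    conn.send("stop")        # the consumer breaks, raising StopIteration here
except StopIteration:
    pass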