# urllib_1_basic usage
"""Fetch a web page with urllib and print its decoded HTML source.

Created on 2023/3/23 20:54.
@Author: haifei
"""
import time
import urllib.request


def fetch_text(url, encoding='utf-8'):
    """Download *url* and return its body decoded to ``str``.

    Args:
        url: Address to request; anything ``urllib.request.urlopen`` accepts.
        encoding: Codec used to decode the raw bytes of the body.

    Returns:
        The response body as text.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # 2. Simulate a browser sending a request to the server.  The ``with``
    #    block closes the response even when reading fails; the original
    #    script never closed it.
    with urllib.request.urlopen(url) as response:
        # 3. .read() returns the page source as bytes.
        raw = response.read()
    # 4. Decode: bytes -> str.
    return raw.decode(encoding)


if __name__ == '__main__':
    start = time.time()

    # 1. Define the address to access.
    url = 'http://www.baidu.com'
    content = fetch_text(url)
    print(content)

    print('It takes', time.time() - start, "seconds.")
# urllib_2_one type and six methods
"""Inspect what ``urlopen`` returns: one type and six methods.

Created on 2023/3/23 21:01.
@Author: haifei
"""
import time
from urllib import request

# Start the clock before the request.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

url = "http://irun2u.top"

# The ``with`` block closes the response once we are done with it.
with request.urlopen(url) as response:
    # One type.
    print(type(response))  # <class 'http.client.HTTPResponse'>

    # Six methods:
    #   .read([n])     read the body, optionally only the next n bytes
    #   .readline()    read a single line
    #   .readlines()   read all remaining lines
    #   .getcode()     .geturl()     .getheaders()
    # The three read methods consume the body stream, so they cannot be
    # combined to each see the full content of the same response.
    print(response.getcode())     # status code, e.g. 200 for OK
    print(response.geturl())      # address actually visited: http://irun2u.top
    print(response.getheaders())  # headers of the response

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_3_download
"""Download a web page, an image and a video with ``urlretrieve``.

Created on 2023/3/23 21:16.
@Author: haifei
"""
import os
import time
from urllib import request

# Start the clock before the downloads.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

# urlretrieve() writes straight to the given path and does not create
# missing directories, so make sure the target folder exists first.
os.makedirs('./download', exist_ok=True)

# download webpage
url_page = 'http://irun2u.top'
request.urlretrieve(url_page, './download/irun2utop.html')

# NOTE(review): the two media URLs below look damaged by copy/paste (they
# contain embedded spaces and '& amp;' fragments) and appear to be signed,
# time-limited links.  They are kept exactly as found because the original
# values cannot be reconstructed reliably; re-copy fresh links from the
# browser before running, otherwise these two downloads are expected to fail.

# download image
url_img = 'https://gimg2.baidu.com/image_search/src=http://safe-img.xhscdn.com/bw1/91239c50-d064-4ec1-b998-1e5f979c9c46?imageView2/2/w/1080/format /jpg & amp;refer=http://safe-img.xhscdn.com & amp;app=2002 & amp;size=f9999,10000 & amp;q=a80 & amp;n=0 & amp;g=0n &fmt=auto?sec=1682170811 &t=53fd80c95575efcc38e04269a4addf3f'
request.urlretrieve(url=url_img, filename='./download/lisa.jpg')

# download video
url_video = 'https://vd4.bdstatic.com/mda-kg0pcztgi0rucsza/v1-cae/sc/mda-kg0pcztgi0rucsza.mp4?v_from_s=hkapp-haokan-nanjing & amp;auth_key=1679580855-0-0-293c71bb38a72b92a3159ampda1 ;bcevod_channel=searchbox_feed &pd=1 &cd=0 &pt=3 &logid=2655222603 &vid=10392909521055706475 &abtest=107353_1 &klogid=2655222603'
request.urlretrieve(url_video, './download/lisa.mp4')

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_4_request object customization
"""Customise the request object to get past User-Agent based anti-crawling.

Created on 2023/3/23 21:52.
@Author: haifei
"""
import time
from urllib import request

# Start the clock before the requests.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

# http  -> port 80
# https -> http + ssl, port 443
url = 'https://www.baidu.com'

# Plain request without custom headers; ``with`` closes the response.
with request.urlopen(url) as response:
    content = response.read().decode('utf-8')
# Far less content comes back than from http://www.baidu.com; the reason is
# anti-crawling on the server side.
print(content)

# UA anti-crawling
# ----------------
# The User-Agent (UA) is a special request header that lets the server
# identify the client's operating system and version, CPU type, browser and
# version, browser kernel, rendering engine, language, plug-ins, etc.
# UA collection for reference:
# https://blog.csdn.net/Uridis/article/details/86558811

# Request object customisation: send a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
request2 = request.Request(url=url, headers=headers)
with request.urlopen(request2) as response2:
    content2 = response2.read().decode('utf-8')
# OK: the content now matches what http://www.baidu.com returns.
print(content2)

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_5_quote method for GET requests
"""Build a GET search URL by percent-encoding one parameter with ``quote``.

Created on 2023/3/23 22:13.
@Author: haifei
"""
import time
from urllib import request, parse

# Start the clock before the request.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

# Target: https://www.baidu.com/s?ie=UTF-8&wd=Jay Chou
# The keyword cannot be placed in the URL as-is; quote() percent-encodes
# the characters that are not allowed there.
name = parse.quote('Jay Chou')
print(name)  # Jay%20Chou

# The separator used to be the mangled ' & amp;' (with spaces), which made
# the URL invalid; it is a plain '&'.
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + name
print(url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# Request object customisation is the way around User-Agent anti-crawling.
request2 = request.Request(url=url, headers=headers)
with request.urlopen(request2) as response:  # closed when the block exits
    content = response.read().decode('utf-8')
print(content)

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_6_urlencode method for GET requests
"""Build a multi-parameter GET query string with ``urlencode``.

Created on 2023/3/23 22:59.
@Author: haifei
"""
import time
from urllib import parse, request

# Start the clock before the request.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# urlencode application scenario: several parameters at once.
# Target: https://www.baidu.com/s?ie=UTF-8&wd=Jay Chou&sex=male

# Variant 1: encode every value by hand with quote().  The separators used
# to be the mangled ' & amp;' and the second value was 'Male ' (trailing
# space), which did not match the dictionary-based variant below.
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + parse.quote('Jay Chou') + '&sex=' + parse.quote('male')
print('url: ' + url)

# Variant 2: let urlencode() build the same query from a dict.  It encodes
# a space as '+' where quote() produces '%20'.
data = {
    'wd': 'Jay Chou',
    'sex': 'male',
}
url2 = 'https://www.baidu.com/s?ie=UTF-8&' + parse.urlencode(data)
print('url2: ' + url2)

base_url = 'https://www.baidu.com/s?'
base_data = {
    'wd': 'Lisa',
    'sex': 'female',
    'location': 'South Korea',
}
new_data = parse.urlencode(base_data)
new_url = base_url + new_data
print(new_url)

# Customised request object (browser User-Agent) against anti-crawling.
request2 = request.Request(url=new_url, headers=headers)
with request.urlopen(request2) as response:  # closed when the block exits
    content = response.read().decode('utf-8')
print(content)

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_7_post requests ordinary translation from Baidu Translate
"""Send a POST request to the Baidu Translate ``sug`` interface.

Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse

# Start the clock before the request.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# Found with Chrome DevTools (F12) on Baidu Translate: the "sug" interface
# (ordinary translation).
url = "https://fanyi.baidu.com/sug"
data = {
    'kw': 'spider'
}

# POST parameters must be url-encoded and then turned into bytes.
data = parse.urlencode(data).encode('utf-8')
print(data)  # b'kw=spider'

# POST parameters are not appended to the URL; they are passed through the
# ``data`` argument of the customised request object.
_request = request.Request(url=url, data=data, headers=headers)
print(_request)  # <urllib.request.Request object at 0x...>

# Simulate a browser sending the POST request; ``with`` closes the response.
with request.urlopen(_request) as response:
    print(response)  # <http.client.HTTPResponse object at 0x...>
    content = response.read().decode("utf-8")
print(content)
print(type(content))  # <class 'str'>

# Two ways to turn the response string into a dictionary:

# 1. eval(str)
# SECURITY: eval() executes whatever text the server sends back.  It is kept
# only because this script demonstrates both approaches; prefer json.loads()
# (way 2) for anything beyond a demo.
dic_content = eval(content)
print(type(dic_content))  # <class 'dict'>
data = dic_content.get('data')[0]
print(data)  # {'k': 'spider', 'v': 'n. spider; star wheel, cross; tripod with handle; tripod'}
print(type(data))  # <class 'dict'>
print(data.get('v'))  # n. spider; star wheel, cross; tripod with handle; tripod

# 2. json.loads(str) -- also yields a dict
json_content = json.loads(content)
print(json_content)  # same content as dic_content
print(type(json_content))  # <class 'dict'>

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
# urllib_8_post requests detailed translation from Baidu Translate
"""Send a POST request to the Baidu Translate ``v2transapi`` interface.

Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse

# Start the clock before the request.  The original started it after all
# the work had finished, so the reported time was always close to zero.
start = time.time()

# Request headers copied from Chrome DevTools (F12 -> Headers -> Request
# Headers) for this interface.
# Accept-Encoding stays commented out: with it enabled the body is not
# plain utf-8 text and decoding it below raises an error.
# NOTE(review): Acs-Token and Cookie are values captured from one browser
# session and are kept as found (apart from re-joining the Cookie, which was
# split by a line break); replace them with fresh values before running.
headers = {
    'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Acs-Token': '1679672667898_1679672667576_SlMSHXMJiE5lO9O3mCbWXoLpxMKuOCuCrmVe6FIg/IKZBgeYHKHsmtqdpt/0wzm4lRYtqHhwdh5bF9qEEols1QlVyi8FUOJsMsWtaiq3LlPe4Bg3rUMLI26ka8WrCqkw4jVHdLC + W6gtaUPft3vRHGatTpVwSwiI1qNsvjl + N7fs0qf1mF//0C3ea6IoZ4/nE1uWLWTzqHkt0TIw/FJlHUt7oNn + 5fyrKP1nUBSKU00xpi + awI/Zsv7tlLLNyxrt0 + ePrjepVLzrK9kEHr9zNU2Cpqox3Kc88rMb61Vuc8 + YJWV4FVvyQZ1 + 6wQ7aPd + QuAx0RyEXTqU1YoVXFVKbeZviLGgI1POh9075YP89vo=',
    'Connection': 'keep-alive',
    # Equals the byte length of the url-encoded form defined below; adjust
    # it (or drop the header) whenever the form fields change.
    'Content-Length': '116',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': (
        'BIDUPSID=B9B52F7273A4D0A02F4224DF0FE584E9; PSTM=1644560257; ZFY=WJR0yuV2wnPtrVSkigGW9zh6r:BS3wlaNLebcRmDOrT4:C; BAIDUID=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; BAIDUID_BFESS=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1679671060; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH =1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_NGRlOTg4M2IyMmZjMDhkNWQzYWQ2N2EzZmIxYzY3YzVhNTE4YTZmNGNjZTZiZTU4NTQ1ZThhYWNlNjU5Y2YyYWZmZDMyZTAwYjUxMzJjMWExMjVkYzQyZmU4MzVhN2JiZDVkNDBhMjEzYzJmNjZkMTJkODg4ZWNmNGY5YjNlMGRlMWM5NGU0NjE4ZDJiOTc2YTQzNDk5ZTBmYmI4NWU0NQ==; '
        'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1679672667'
    ),
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/',
    'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
# According to the original author, every header above except Cookie can be
# commented out; Cookie is the key to getting past the anti-crawling check.

# Found with Chrome DevTools (F12) on Baidu Translate: the "v2transapi"
# interface (detailed translation).  The literal used to contain a space
# before '&to', which made the URL invalid.
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"

# Form fields copied from DevTools (F12 -> Payload -> Form Data).
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'girl',
    'simple_means_flag': '3',
    'sign': '780982.985479',
    'token': '9d0251d64cfa1d98e5aab063d19cd487',
    'domain': 'common',
}

# POST parameters must be url-encoded and then turned into bytes.
data = parse.urlencode(data).encode('utf-8')

# Request object customisation.
_request = request.Request(url=url, data=data, headers=headers)

# Simulate a browser sending the POST request; ``with`` closes the response.
with request.urlopen(_request) as response:
    content = response.read().decode('utf-8')
print(content)  # str
print(json.loads(content))  # dict

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")
https://www.bilibili.com/video/BV1Db4y1m7Ho