urllib: basic usage, GET and POST requests

urllib_1_basic usage

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 20:54.
@Author: haifei
"""
import time
import urllib.request

if __name__ == '__main__':
    # Fetch a page with urllib, print its source and report the elapsed time.
    start = time.time()

    # 1. Define the address to fetch.
    url = 'http://www.baidu.com'
    # 2. Send the request the way a browser would. The context manager closes
    #    the connection even if reading the body raises.
    with urllib.request.urlopen(url) as response:
        # 3. read() returns the raw response body as bytes.
        content = response.read()
    # 4. Decode: bytes --> str.
    content = content.decode('utf-8')
    print(content)

    print('It takes', time.time() - start, "seconds.")

urllib_2_one type and six methods

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 21:01.
@Author: haifei
"""
import time
from urllib import request

# Start the timer before the request so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

url = "http://irun2u.top"

# The context manager closes the connection once the demo is finished.
with request.urlopen(url) as response:
    # content = response.read().decode("utf-8")
    # print(content)

    # One type: urlopen() hands back an http.client.HTTPResponse.
    print(type(response))  # <class 'http.client.HTTPResponse'>

    # Six methods:
    #   .read([n bytes]) .readline() .readlines() .getcode() .geturl() .getheaders()

    # The body is a stream: whatever one read call consumes is gone for the
    # next one, so enable only one of these at a time.
    # print(response.read(5))      # read 5 bytes
    # print(response.readline())   # read a single line
    # print(response.readlines())  # read all remaining lines into a list

    print(response.getcode())     # status code, e.g. 200 --> OK
    print(response.geturl())      # URL that was actually fetched: http://irun2u.top
    print(response.getheaders())  # response headers as (name, value) pairs

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_3_download

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 21:16.
@Author: haifei
"""
import time
from urllib import request

import os

# Start the timer before the downloads so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

# urlretrieve() writes straight to the given path and does not create missing
# parent directories, so make sure the target folder exists first.
os.makedirs('./download', exist_ok=True)

# Download a web page.
url_page = 'http://irun2u.top'
request.urlretrieve(url_page, './download/irun2utop.html')

# NOTE(review): the two media URLs below contain literal spaces and '& amp;'
# fragments, which looks like damage from HTML-escaping/translation of the
# original text. http.client rejects URLs containing spaces (InvalidURL), and
# the links carry expiring signatures anyway -- replace them with fresh URLs
# copied from the browser before running. They are left untouched here because
# the original values cannot be reconstructed with certainty.

# Download an image.
url_img = 'https://gimg2.baidu.com/image_search/src=http://safe-img.xhscdn.com/bw1/91239c50-d064-4ec1-b998-1e5f979c9c46?imageView2/2/w/1080/format /jpg & amp;refer=http://safe-img.xhscdn.com & amp;app=2002 & amp;size=f9999,10000 & amp;q=a80 & amp;n=0 & amp;g=0n &fmt=auto?sec=1682170811 &t=53fd80c95575efcc38e04269a4addf3f'
request.urlretrieve(url=url_img, filename='./download/lisa.jpg')

# Download a video.
url_video = 'https://vd4.bdstatic.com/mda-kg0pcztgi0rucsza/v1-cae/sc/mda-kg0pcztgi0rucsza.mp4?v_from_s=hkapp-haokan-nanjing & amp;auth_key=1679580855-0-0-293c71bb38a72b92a3159ampda1 ;bcevod_channel=searchbox_feed &pd=1 &cd=0 &pt=3 &logid=2655222603 &vid=10392909521055706475 &abtest=107353_1 &klogid=2655222603'
request.urlretrieve(url_video, './download/lisa.mp4')


if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_4_request object customization

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 21:52.
@Author: haifei
"""
import time
from urllib import request

# http  -> port 80
# https -> http + ssl, port 443

# Start the timer before the requests so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

url = 'https://www.baidu.com'

# Plain request with urllib's default headers. The context manager closes the
# connection as soon as the body has been read.
with request.urlopen(url) as response:
    content = response.read().decode('utf-8')
print(content)  # Far less content than http://www.baidu.com returns: the site's anti-crawling check kicks in


# User-Agent (UA) based anti-crawling
# The User-Agent is a request header that lets the server identify the
# client's operating system and version, CPU type, browser and version,
# browser kernel/rendering engine, browser language, plug-ins, etc.
# List of common UA strings: https://blog.csdn.net/Uridis/article/details/86558811

# Request object customization: urlopen() takes no headers argument, so the
# headers are attached to a Request object which is then opened.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
request2 = request.Request(url=url, headers=headers)
with request.urlopen(request2) as response2:
    content2 = response2.read().decode('utf-8')
print(content2)  # OK: the content now matches what http://www.baidu.com returns


if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_5_get request: the quote method

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 22:13.
@Author: haifei
"""
import time
from urllib import request, parse


# Start the timer before the request so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

# Target search URL: https://www.baidu.com/s?ie=UTF-8&wd=Jay Chou
# The query value cannot be sent as-is: characters that are not URL-safe (the
# space here, or any non-ASCII text) must be percent-encoded first.
name = parse.quote('Jay Chou')  # percent-encode the search term
print(name)  # Jay%20Chou
# The parameters are joined with a bare '&'. A literal space or an HTML
# '&amp;' entity inside the URL makes http.client reject it (InvalidURL).
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + name
print(url)


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# Request object customization is the way around User-Agent based anti-crawling.
request2 = request.Request(url=url, headers=headers)
# The context manager closes the connection once the body has been read.
with request.urlopen(request2) as response:
    content = response.read().decode('utf-8')
print(content)


if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_6_get request: the urlencode method

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/23 22:59.
@Author: haifei
"""
import time
from urllib import parse, request


# Start the timer before the request so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# urlencode use case: several query parameters at once.
# Target: https://www.baidu.com/s?ie=UTF-8&wd=Jay Chou&sex=male

# Manual approach: quote() every value and glue the pairs together with '&'.
# The separators must be a bare '&' with no surrounding spaces.
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + parse.quote('Jay Chou') + '&sex=' + parse.quote('male')
print('url: ' + url)

# urlencode approach: hand over a dict and get the whole "k=v&k=v" string back.
data = {
    'wd': 'Jay Chou',
    'sex': 'male'
}
url2 = 'https://www.baidu.com/s?ie=UTF-8&' + parse.urlencode(data)
print('url2: ' + url2)  # equivalent to url; urlencode() writes the space as '+', quote() as '%20'


base_url = 'https://www.baidu.com/s?'
base_data = {
    'wd': 'Lisa',
    'sex': 'female',
    'location': 'South Korea'
}
new_data = parse.urlencode(base_data)
new_url = base_url + new_data
print(new_url)
# Request object customization (custom User-Agent) to get past anti-crawling.
request2 = request.Request(url=new_url, headers=headers)
# The context manager closes the connection once the body has been read.
with request.urlopen(request2) as response:
    content = response.read().decode('utf-8')
print(content)


if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_7_post request: Baidu Translate basic translation

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse


# Start the timer before the request so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# Open Baidu Translate in Chrome, press F12 and look for the "sug" interface
# (basic translation).
url = "https://fanyi.baidu.com/sug"

data = {
    'kw': 'spider'
}

# POST parameters must be form-encoded with urlencode() and then turned into
# bytes with .encode('utf-8'); urllib only accepts a bytes body.
data = parse.urlencode(data).encode('utf-8')
print(data)  # b'kw=spider'

# Request object customization.
# POST parameters are not appended to the URL; they go into the Request
# object's `data` argument instead.
_request = request.Request(url=url, data=data, headers=headers)
print(_request)  # <urllib.request.Request object at 0x102ac6250>

# Send the POST request the way a browser would. The context manager closes
# the connection once the body has been read.
with request.urlopen(_request) as response:
    print(response)  # <http.client.HTTPResponse object at 0x10121a5b0>

    # Read the response body.
    content = response.read().decode("utf-8")
print(content)
print(type(content))  # <class 'str'>

# Turn the JSON text into a dictionary with json.loads().
# Do NOT use eval(content) for this: eval() executes whatever the server sent
# as Python code, and it also breaks on JSON literals such as true/false/null.
dic_content = json.loads(content)
print(type(dic_content))  # <class 'dict'>
first_entry = dic_content.get('data')[0]
print(first_entry)  # {'k': 'spider', 'v': 'n. spider; star wheel, cross; tripod with handle; tripod'}
print(type(first_entry))  # <class 'dict'>
print(first_entry.get('v'))  # n. spider; star wheel, cross; tripod with handle; tripod

# The parsed JSON object is an ordinary dict.
json_content = dic_content
print(json_content)
print(type(json_content))  # <class 'dict'>

if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

urllib_8_post request: Baidu Translate detailed translation

"""
                                       .-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
    | | | `'. | | | | | / /.-'_.' | `'. |
    |_| ' .'| '/| | | | / _.' ' .'| '/
           `-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse

# Start the timer before the request so the figure printed at the end covers
# the network work instead of an empty interval.
start = time.time()

# The headers below were copied from F12 -> Headers -> Request Headers of the
# v2transapi call.
# Accept-Encoding stays commented out: urllib does not decompress the body, so
# asking for gzip/deflate/br would make the later .decode('utf-8') fail.
# NOTE(review): 'Acs-Token' and 'Cookie' are session credentials captured from
# one browser session; they expire and should not be committed. Both literals
# also contain stray spaces (' + ', 'FANYI_WORD_SWITCH =1') that look like
# copy/translation damage -- re-copy them from the browser before running.
headers = {
    'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Acs-Token': '1679672667898_1679672667576_SlMSHXMJiE5lO9O3mCbWXoLpxMKuOCuCrmVe6FIg/IKZBgeYHKHsmtqdpt/0wzm4lRYtqHhwdh5bF9qEEols1QlVyi8FUOJsMsWtaiq3LlPe4Bg3rUMLI26ka8WrCqkw4jVHdLC + W6gtaUPft3vRHGatTpVwSwiI1qNsvjl + N7fs0qf1mF//0C3ea6IoZ4/nE1uWLWTzqHkt0TIw/FJlHUt7oNn + 5fyrKP1nUBSKU00xpi + awI/Zsv7tlLLNyxrt0 + ePrjepVLzrK9kEHr9zNU2Cpqox3Kc88rMb61Vuc8 + YJWV4FVvyQZ1 + 6wQ7aPd + QuAx0RyEXTqU1YoVXFVKbeZviLGgI1POh9075YP89vo=',
    'Connection': 'keep-alive',
    # Matches the encoded length of the form data built below; update or drop
    # it if the form changes (urllib sets Content-Length itself when absent).
    'Content-Length': '116',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=B9B52F7273A4D0A02F4224DF0FE584E9; PSTM=1644560257; ZFY=WJR0yuV2wnPtrVSkigGW9zh6r:BS3wlaNLebcRmDOrT4:C; BAIDUID=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; BAIDUID_BFESS=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1679671060; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH =1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_NGRlOTg4M2IyMmZjMDhkNWQzYWQ2N2EzZmIxYzY3YzVhNTE4YTZmNGNjZTZiZTU4NTQ1ZThhYWNlNjU5Y2YyYWZmZDMyZTAwYjUxMzJjMWExMjVkYzQyZmU4MzVhN2JiZDVkNDBhMjEzYzJmNjZkMTJkODg4ZWNmNGY5YjNlMGRlMWM5NGU0NjE4ZDJiOTc2YTQzNDk5ZTBmYmI4NWU0NQ==; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1679672667',
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/',
    'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
# In practice every header above except Cookie can be commented out; the
# Cookie is what the site's anti-crawling check looks at.

# Open Baidu Translate in Chrome, press F12 and look for the "v2transapi"
# interface (detailed translation). The query string must not contain spaces,
# otherwise http.client rejects the URL (InvalidURL).
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"

# The required parameters are the Form Data shown under F12 -> Payload.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'girl',
    'simple_means_flag': '3',
    'sign': '780982.985479',
    'token': '9d0251d64cfa1d98e5aab063d19cd487',
    'domain': 'common',
}

# POST parameters must be form-encoded and then converted to bytes.
data = parse.urlencode(data).encode('utf-8')

# Request object customization.
_request = request.Request(url=url, data=data, headers=headers)

# Send the POST request the way a browser would. The context manager closes
# the connection once the body has been read.
with request.urlopen(_request) as response:
    content = response.read().decode('utf-8')
print(content)  # str
print(json.loads(content))  # dict


if __name__ == '__main__':
    print('It takes', time.time() - start, "seconds.")

https://www.bilibili.com/video/BV1Db4y1m7Ho