Crawling and parsing historical weather data from 2345 Weather King (tianqi.2345.com) with requests, urllib and BeautifulSoup
Website: Dongcheng Historical Weather Query_Historical Weather Forecast Query_2345 Weather Forecast
1. Code
import json
import logging
import urllib.parse
from datetime import date, datetime
from random import randint
from time import sleep

import pymysql
import requests
from bs4 import BeautifulSoup

# Endpoint that returns one month of historical weather as JSON-wrapped HTML.
HISTORY_URL = "https://tianqi.2345.com/Pc/GetHistory"

# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
}

# Seconds to wait for the server; requests never times out on its own.
REQUEST_TIMEOUT = 10

# Token separating wind direction from wind power in the fifth table column.
# NOTE(review): the live site presumably returns Chinese text, in which case
# this separator never matches and should be the Chinese character for
# "wind" -- confirm against a real response before relying on wind/winp.
WIND_SEPARATOR = "wind"

# Values are bound by the driver (pymysql uses %s placeholders), so quoting
# and escaping are handled by the database layer, not by string building.
INSERT_SQL = (
    "insert into w_weather_day_history "
    "(rdate,weather_code,area_code,wind,winp,temp_max,temp_min,weather) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s)"
)


def weather_req(years=(2023,), months=(1, 2, 3, 4, 5, 6)):
    """Crawl monthly weather history for every known area and store it.

    For each (weather_code, area_code) pair returned by ``get_code`` and each
    requested year/month, the history page is fetched, its table is parsed
    and the resulting rows are inserted into ``w_weather_day_history``.

    :param years: iterable of years to crawl; defaults to 2023 only, which is
        what the original script did.
    :param months: iterable of month numbers to crawl; defaults to 1-6.
    """
    params = {
        'areaInfo[areaId]': 70809,
        'areaInfo[areaType]': 2,
        'date[year]': 2023,
        'date[month]': 6
    }

    for weather_code, area_code in get_code():
        params['areaInfo[areaId]'] = weather_code
        for year_item in years:
            params['date[year]'] = year_item
            for month_item in months:
                print(f"Crawling the data of month [{month_item}] with weather ID [{weather_code}] and area ID [{area_code}]!")
                params['date[month]'] = month_item
                try:
                    weather_data = _fetch_history_html(params)
                    weather_list = _parse_history_rows(
                        weather_data, weather_code, area_code
                    )
                except requests.RequestException as e:
                    # Network / HTTP failure: report it and move on to the
                    # next month instead of aborting the whole crawl.
                    print("An error occurred:", e)
                    continue
                except (ValueError, KeyError) as e:
                    # Invalid JSON, missing 'data' key or an unparsable date.
                    print("An error occurred:", e)
                    continue

                print(weather_list)
                _save_rows(weather_list)
                print("------------------------------------------------ ----")


def _fetch_history_html(params):
    """Request one month of history and return the HTML fragment in 'data'."""
    full_url = HISTORY_URL + '?' + urllib.parse.urlencode(params)
    print(full_url)
    sleep(randint(1, 3))  # random 1-3 second pause between requests
    res = requests.get(full_url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res_data = json.loads(res.text)
    return res_data['data']


def _parse_history_rows(weather_data, weather_code, area_code):
    """Turn the history table HTML into a list of per-day value lists.

    Each entry is ``[date, weather_code, area_code, wind, winp, temp_max,
    temp_min, weather]``, matching the column order of ``INSERT_SQL``.
    Returns an empty list when the page contains no history table.
    """
    soup = BeautifulSoup(weather_data, 'html.parser')
    table_data = soup.find('table', attrs={'class': 'history-table'})
    if table_data is None:
        return []

    weather_list = []
    # The first <tr> is skipped, exactly as the original index loop did.
    for tr_item in table_data.find_all('tr')[1:]:
        all_td = tr_item.find_all("td")
        if len(all_td) < 5:
            # Not a complete data row; nothing usable to store.
            continue

        # The date cell looks like "2023-01-01 Sunday"; keep only the date
        # and round-trip it through datetime to validate the format.
        rdate_new = str(all_td[0].text).split(" ")[0]
        mysql_date = datetime.strptime(rdate_new, "%Y-%m-%d").strftime("%Y-%m-%d")

        # Wind direction and wind power share one cell.
        wind_parts = str(all_td[4].text).split(WIND_SEPARATOR)
        wind = wind_parts[0]
        winp = wind_parts[1] if len(wind_parts) > 1 else ""

        weather_list.append([
            mysql_date,
            weather_code,
            area_code,
            wind,
            winp,
            str(all_td[1].text),  # maximum temperature
            str(all_td[2].text),  # minimum temperature
            str(all_td[3].text),  # weather conditions
        ])
    return weather_list


def _save_rows(weather_list):
    """Insert the parsed rows in one transaction and release the connection."""
    if not weather_list:
        return
    conn_his, cursor_his = get_conn()
    try:
        cursor_his.executemany(INSERT_SQL, [tuple(row) for row in weather_list])
        conn_his.commit()
    finally:
        close_conn(conn_his, cursor_his)


def get_code():
    """Return every (weather_code, area_code) row from f_weather_area_code."""
    conn, cursor = get_conn()
    try:
        SQL = "select fwc.weather_code,fwc.area_code from f_weather_area_code fwc;"
        cursor.execute(SQL)
        res = cursor.fetchall()
    finally:
        close_conn(conn, cursor)
    print(res)
    return res


def get_conn():
    """
    Open a connection to the local ``weather`` database.

    :return: connection, cursor
    """
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="reliable",
                           db="weather",
                           charset="utf8")
    # pymysql's default cursor returns result rows as tuples.
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    """Close the cursor and the connection, skipping whichever is falsy."""
    if cursor:
        cursor.close()
    if conn:
        conn.close()


if __name__ == '__main__':
    weather_req()
2. Analysis
The url is composed as follows:
Basic url: https://tianqi.2345.com/Pc/GetHistory
Parameters:
# Query-string parameters expected by the GetHistory endpoint.
params = {
    'areaInfo[areaId]': 70809,   # weather code of the area
    'areaInfo[areaType]': 2,
    'date[year]': 2023,          # year to query
    'date[month]': 6,            # month to query
}
areaInfo[areaId] is the weather code of the area you want to query; you have to look this code up yourself.
areaInfo[areaType] does not need to be changed (it stays 2 in this example).
The last two parameters, date[year] and date[month], are the year and month to query.
3. Initiate request demo
# Demo: request one month of history and pull the HTML fragment out of the
# JSON response.
url = "https://tianqi.2345.com/Pc/GetHistory"  # base URL of the endpoint

# Request headers: present a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
}

# GET parameters.
params = {
    'areaInfo[areaId]': 70809,
    'areaInfo[areaType]': 2,
    'date[year]': 2023,
    'date[month]': 6
}

# URL-encode the parameters and append them to the base URL.
encoded_params = urllib.parse.urlencode(params)
full_url = url + '?' + encoded_params

sleep(randint(1, 3))  # pause for a random 1-3 seconds before the request

# requests has no default timeout, so set one to avoid hanging forever,
# and fail loudly on an HTTP error status instead of parsing an error page.
res = requests.get(full_url, headers=headers, timeout=10)
res.raise_for_status()
res_data = json.loads(res.text)
weather_data = res_data['data']  # HTML fragment containing the history table
4. Parsing data demo
# Demo: parse the HTML fragment held in weather_data.
soup = BeautifulSoup(weather_data, 'html.parser')

# Locate the history table by its CSS class.
table_data = soup.find('table', attrs={'class': 'history-table'})

# find() returns None when no such table exists, so fall back to an empty
# list of rows instead of raising AttributeError.
all_tr = table_data.find_all('tr') if table_data is not None else []
The knowledge points in this article match the official knowledge files, where you can study the related topics further: Python entry skill tree · Web crawler · Beautiful Soup. 383029 people are learning the system.