Crawling and parsing 2345 Weather King historical weather data with urllib + BeautifulSoup

Website: Dongcheng Historical Weather Query_Historical Weather Forecast Query_2345 Weather Forecast

1. Code

import json
import logging
import urllib.parse
from datetime import date, datetime
from random import randint
from time import sleep

import pymysql
import requests
from bs4 import BeautifulSoup

def weather_req():
    month_list = [1, 2, 3, 4, 5, 6] # months to crawl
    code_list = get_code() # Get all weather codes and region codes
    # Goal: January 2018 through June 2023 (date[year] below is fixed at 2023,
    # so adjust it per run to cover earlier years)
    url = "https://tianqi.2345.com/Pc/GetHistory" # Base URL
    full_url = "" # the final assembled URL
    # Define request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
    }
    # Define GET parameters
    params = {
        'areaInfo[areaId]': 70809,
        'areaInfo[areaType]': 2,
        'date[year]': 2023,
        'date[month]': 6
    }
    # Traverse the list of weather codes and region codes
    for code_item in code_list:
        weather_code = code_item[0] # Get the weather code
        area_code = code_item[1] # Get the area code
        # Modify the value of url parameter weather code
        params['areaInfo[areaId]'] = weather_code
        # Start traversing the month list
        for month_item in month_list:
            print(f"Crawling the data of month [{month_item}] with weather ID [{weather_code}] and area ID [{area_code}]!")
            # Modify the value of month to the new value
            params['date[month]'] = month_item
            # Encode GET parameters
            encoded_params = urllib.parse.urlencode(params)
            # Build the full URL
            full_url = url + '?' + encoded_params
            print(full_url)
            try:
                sleep(randint(1, 3)) # sleep (randomly 1-3 seconds)
                # Initiate a request
                res = requests.get(full_url, headers=headers)
                res_data = json.loads(res.text)
                weather_data = res_data['data']
                # print(weather_data)
                # Parse the data
                soup = BeautifulSoup(weather_data, 'html.parser')
                # Get the required table
                table_data = soup.find('table', attrs={'class': 'history-table'})
                # print(type(table_data), '\n', table_data)
                all_tr = table_data.find_all('tr') # Get all tr
                # print(all_tr[0])
                weather_list = [] # List that stores the parsed rows
                # Traverse the tr rows; weather_list collects a full month of data for one region
                for i in range(1, len(all_tr)):
                    temp_list = [] # Temporarily store one day's data and refresh it every cycle
                    tr_item = all_tr[i] # Get a tr data
                    all_td = tr_item.find_all("td") # Get all td in a tr, the text in td is the required value
                    rdate = str(all_td[0].text) # Date 2023-01-01 Sunday
                    # The date needs to be converted to format, removing the day of the week
                    rdate_new = rdate.split(" ")[0] # Get the date string
                    # Parse string into date object
                    date_object = datetime.strptime(rdate_new, "%Y-%m-%d")
                    # Format the date object into a date string that MySQL can store
                    mysql_date = date_object.strftime("%Y-%m-%d") # The final date stored
                    wind_and_power = all_td[4].text # Wind direction and power come combined (e.g. "东北风2级") and must be split
                    wind = str(wind_and_power).split("风")[0] # Wind direction (the site returns Chinese text, so split on "风")
                    winp = str(wind_and_power).split("风")[1] # Wind power
                    temp_max = str(all_td[1].text) # Maximum temperature
                    temp_min = str(all_td[2].text) # Minimum temperature
                    weather = str(all_td[3].text) # Weather conditions
                    # Collect the above values into temp_list, then add the row to weather_list
                    temp_list.append(mysql_date) # Date
                    temp_list.append(weather_code) # Weather code
                    temp_list.append(area_code) # Area code
                    temp_list.append(wind) # Wind direction
                    temp_list.append(winp) # Wind power
                    temp_list.append(temp_max) # Maximum temperature
                    temp_list.append(temp_min) # Minimum temperature
                    temp_list.append(weather) # Weather conditions
                    weather_list.append(temp_list)
                print(weather_list)
                # Start inserting data [data from a certain region, a certain year, a certain month]
                conn_his,cursor_his = get_conn() # Establish a database connection
                # Traverse the data
                for save_item in weather_list:
                    INSERT_SQL = "insert into w_weather_day_history (rdate,weather_code,area_code,wind,winp,temp_max,temp_min,weather) " \
                                 "values(%s,%s,%s,%s,%s,%s,%s,%s)" \
                                 " "%(""" + save_item[0] + """,
                                                  """ + save_item[1] + """,
                                                  """ + save_item[2] + """,
                                                  """ + save_item[3] + """
                                                 ,""" + save_item[4] + """
                                                 ,""" + save_item[5] + """
                                                 ,""" + save_item[6] + """
                                                 ,""" + save_item[7] + """)

                    print(INSERT_SQL)
                    cursor_his.execute(INSERT_SQL) #Execute sql statement
                    conn_his.commit() # Submit transaction
                    print("------------------------------------------------ ----")
            except urllib.error.URLError as e:
                print("An error occurred:", e)

def get_code():
    conn,cursor = get_conn()
    SQL = "select fwc.weather_code,fwc.area_code from f_weather_area_code fwc;"
    cursor.execute(SQL)
    res = cursor.fetchall()
    print(res)
    return res

def get_conn():
    """
    :return: connection, cursor
    """
    # Create connection
    conn = pymysql.connect(host="127.0.0.1",
                    user="root",
                    password="reliable",
                    db="weather",
                    charset="utf8")
    # Create a cursor (fetched rows are returned as tuples by default)
    cursor = conn.cursor()
    return conn, cursor

def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()

if __name__ == '__main__':
    #get_code()
    weather_req()
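
Note that the script expects two MySQL tables to already exist. Their definitions are not shown here; the following DDL is a minimal sketch inferred from the SELECT and INSERT statements above, and every column type is an assumption:

-- Hypothetical table definitions; only the table and column names come from the code above
CREATE TABLE f_weather_area_code (
    weather_code VARCHAR(16), -- 2345 area id used as areaInfo[areaId]
    area_code    VARCHAR(16)  -- your own region identifier
);

CREATE TABLE w_weather_day_history (
    rdate        DATE,        -- observation date
    weather_code VARCHAR(16),
    area_code    VARCHAR(16),
    wind         VARCHAR(32), -- wind direction
    winp         VARCHAR(32), -- wind power
    temp_max     VARCHAR(16), -- daily high as shown on the page
    temp_min     VARCHAR(16), -- daily low
    weather      VARCHAR(32)  -- weather condition text
);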


2. Analysis

The URL is composed as follows:

Base URL: https://tianqi.2345.com/Pc/GetHistory

Parameters:

params = {
    'areaInfo[areaId]': 70809,
    'areaInfo[areaType]': 2,
    'date[year]': 2023,
    'date[month]': 6
}

areaInfo[areaId] is the weather code for a specific area; you need to collect these codes yourself.

areaInfo[areaType] can be left as-is; its exact meaning does not matter for this query.

The last two parameters are the year and month to query.
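
As a quick sanity check, here is a minimal sketch of what the encoded query string looks like (the bracketed keys are percent-encoded, which the server accepts):

import urllib.parse

params = {
    'areaInfo[areaId]': 70809,
    'areaInfo[areaType]': 2,
    'date[year]': 2023,
    'date[month]': 6
}
print(urllib.parse.urlencode(params))
# areaInfo%5BareaId%5D=70809&areaInfo%5BareaType%5D=2&date%5Byear%5D=2023&date%5Bmonth%5D=6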

3. Initiate request demo

url = "https://tianqi.2345.com/Pc/GetHistory" # Original URL
    full_url = "" #Finally spelled url
    # Define request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
    }
    #Define GET parameters
    params = {
        'areaInfo[areaId]': 70809,
        'areaInfo[areaType]': 2,
        'date[year]': 2023,
        'date[month]': 6
    }
    
    # Parse parameters
    encoded_params = urllib.parse.urlencode(params)
    # Splice the complete URL
    full_url = url + '?' + encoded_params
    sleep(randint(1, 3)) # sleep (randomly 1-3 seconds)
    # Initiate a request
    res = requests.get(full_url, headers=headers)
    res_data = json.loads(res.text)
    weather_data = res_data['data']

4. Parsing data demo

from bs4 import BeautifulSoup

# Parse the data (weather_data comes from the request demo above)
soup = BeautifulSoup(weather_data, 'html.parser')
# Get the required table
table_data = soup.find('table', attrs={'class': 'history-table'})
# print(type(table_data), '\n', table_data)
all_tr = table_data.find_all('tr') # Get all tr rows; the first row is the table header
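
Each remaining tr holds one day of data; a short continuation of the demo (mirroring the loop in the full code above) pulls the cell text out of each row:

# Each tr after the header is one day; its td cells hold date, high, low, weather, wind
for tr_item in all_tr[1:]:
    all_td = tr_item.find_all("td")
    print([td.text.strip() for td in all_td])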
