Predicting time consumption of cargo transportation with an XGBoost model – Part 1: Acquiring weather warning data

  • Build the model on data collected from the Internet.
  • Background 1: Transportation time is affected by bad weather, such as heavy fog, thunderstorms, or typhoons. Whether the shipment moves by air or sea, bad weather before or during departure can delay takeoff, which in turn delays arrival.
  • Background 2: Transportation time is also affected by holidays, such as National Day, Chinese New Year, or weekends. A long holiday can have a large impact on the overall lead time. For example, if pick-up happens mid-week, the lead time to departure may be short; if pick-up falls on a Friday or the weekend, the shipment may not take off until Monday; and if it is the day before National Day, takeoff may be delayed by as long as 7 days.
  • So we need data that tells us how many weekend/holiday days and how many extreme-weather days fall between pick-up and take-off (see the sketch after this list).
  • Now we can start collecting the data from the Internet.
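As a preview of how this data will be used, here is a minimal sketch of the target feature: given a pick-up date and a take-off date, count the holiday days and abnormal-climate days in between. The table df_climate_workday and its weekday_cat and Alarm_info_cat columns are built later in this part; the helper name count_delay_features is hypothetical.

def count_delay_features(df_climate_workday, pickup_date, takeoff_date):
    # Hypothetical helper: count holiday days and abnormal-climate days
    # between pick-up and take-off, using the table assembled below.
    mask = (df_climate_workday['Date'] >= pickup_date) & \
           (df_climate_workday['Date'] < takeoff_date)
    window = df_climate_workday[mask]
    n_holidays = (window['weekday_cat'] == 'holiday').sum()
    n_bad_weather = (window['Alarm_info_cat'] == 'Abnormal climate').sum()
    return n_holidays, n_bad_weather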
import prestodb
import pandas as pd
import numpy as np
import math
import pymysql
import pymssql
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display
from urllib import parse
import datetime
import xlwings as xw
import matplotlib.pyplot as plt
import chinese_calendar
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import requests
from scipy.stats import f_oneway
from bs4 import BeautifulSoup
import re
import time
import random  # needed for random.choice over the User-Agent pool below
headers_lists = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.181 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/adr-1107051709; U; zh-cn) Presto/2.8.149 Version/11.10',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Android; Linux armv7l; rv:5.0) Gecko/ Firefox/5.0 fennec/5.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
)

Step 1: Get Climate Data

#crawl weather
#url = 'https://lishi.tianqi.com/wujiang/202201.html'
headers = {'User-Agent': random.choice(headers_lists),
'Cookie':'lianjia_uuid=9d3277d3-58e4-440e-bade-5069cb5203a4; UM_distinctid=16ba37f7160390-05f17711c11c3e-454c0b2b-100200-16ba37f716618b; _smt_uid=5d176c66.5119839a; sensorsdata2015jssdkcross={"distinct_id":"16ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409","$device_id":"16ba37f7a942a6-0671dfdde0398a-454c0b2b-1049088-16ba37f7a95409","props":{"$latest_traffic_source_type":"direct traffic","$latest_referrer":"","$latest_referrer_host":"","$latest_search_keyword":"No value found_open directly"}}; _ga=GA1.2.1772719071.1561816174; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1561822858; _jzqa=1.2532744094467475000.1561816167.1561822858.1561870561.3; CNZZDATA1253477573=987273979-1561811144-|1561865554; CNZZDATA1254525948=879163647-1561815364-|1561869382; CNZZDATA1255633284=1986996647-1561812900-|1561866923; CNZZDATA1255604082=891570058-1561813905-|1561866148; _qzja=1.1577983579.1561816168942.1561822857520.1561870561449.1561870561449.1561870847908.0.0.0.7.3; select_city=110000; lianjia_ssid=4e1fa281-1ebf-e1c1-ac56-32b3ec83f7ca; srcid==='
            }

def set_link(year):
    # The year parameter is the year whose data needs to be crawled
    link = []
    for i in range(1, 13):
        # one page per month, with the month zero-padded to two digits
        url = 'https://lishi.tianqi.com/shanghai/{}{:02d}.html'.format(year, i)
        link.append(url)
    return link
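For example, set_link(2022) returns the twelve monthly pages in order:

# set_link(2022) ->
# ['https://lishi.tianqi.com/shanghai/202201.html',
#  'https://lishi.tianqi.com/shanghai/202202.html',
#  ...
#  'https://lishi.tianqi.com/shanghai/202212.html']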

def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        html.encoding = html.apparent_encoding
        return html.text
    else:
        return None

date_box = []
max_temp = []
min_temp = []
weh = []
wind = []
week_box = []

def get_data(years):
    link = set_link(years)
    for url in link:
        html = get_page(url, headers)
        if html is None:
            continue  # skip pages that failed to load
        bs = BeautifulSoup(html, 'html.parser')

        data = bs.find_all(class_='thrui')
        date = re.compile('class="th200">(.*?)</')
        tem = re.compile('class="th140">(.*?)</')
        date_list = re.findall(date, str(data))
        for item in date_list:
            # each entry looks like 'YYYY-MM-DD weekday'
            week_box.append(item[10:].strip())
            date_box.append(item[:10])
        temp = re.findall(tem, str(data))
        for i in range(len(date_list)):
            # Four th140 cells per day: max temp, min temp, weather, wind.
            # An earlier version only needed June 2019 and ignored that months
            # have different lengths; indexing rows by i*4 fixes that.
            max_temp.append(temp[i*4 + 0])
            min_temp.append(temp[i*4 + 1])
            weh.append(temp[i*4 + 2])
            wind.append(temp[i*4 + 3])
get_data(2022)
get_data(2023)
datas = pd.DataFrame({'date': date_box, 'week': week_box, 'maximum temperature': max_temp,
                      'minimum temperature': min_temp, 'weather': weh, 'wind direction': wind})
df_climate = datas.drop_duplicates()
# Backup source: crawl the same weather history from tianqi.2345.com,
# reusing the headers defined above.

url='https://tianqi.2345.com/wea_history/58362.htm'
html = get_page(url, headers)  # reuse get_page() defined in Step 1
bs = BeautifulSoup(html, 'html.parser')
list_final = []
for i in bs.find("table", class_="history-table").find_all('tr'):
    if len(i.find_all('td')) > 0:
        list_ = []
        for i_info in i.find_all('td'):
            if len(i_info.text.split(' ')) == 2:
                # the first cell holds 'YYYY-MM-DD weekday'; split it into two fields
                list_.append(i_info.text.split(' ')[0])
                list_.append(i_info.text.split(' ')[1])
            else:
                list_.append(i_info.text)
        list_final.append(list_)
df_climate_backup = pd.DataFrame(list_final, columns=['date', 'week', 'maximum temperature', 'minimum temperature',
                                                      'weather', 'wind direction', 'Air Quality Index', 'Air Quality Class'])
df_climate_backup = df_climate_backup[['date','week','maximum temperature','minimum temperature','weather','wind direction']]
# fill dates missing from the primary source with rows from the backup source
for i_date in df_climate_backup['date']:
    if i_date not in df_climate['date'].tolist():
        df_climate = pd.concat([df_climate,df_climate_backup[df_climate_backup['date']==i_date]])
df_climate = df_climate.reset_index().drop('index',axis=1)
df_climate['city'] = 'Shanghai'
df_climate
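Before moving on, it is worth confirming that the two sources together cover the whole period. A hypothetical sanity check:

# Hypothetical check: every calendar day in the crawled range should appear once
expected = pd.date_range('2022-01-01', '2023-12-31')
missing = expected.difference(pd.to_datetime(df_climate['date']))
print(len(missing), 'days missing from df_climate')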

Weather Warning

# Crawl weather warnings, reusing the imports and headers defined above.
def get_climate(x):
    url = 'https://www.tianqi.com/alarmnews_02/{0}/'.format(x)
    def get_page(url, headers):
        html = requests.get(url, headers=headers)
        if html.status_code == 200:
            html.encoding = html.apparent_encoding
            return html.text
        else:
            # back off once, then retry without the trailing slash
            time.sleep(1)
            url = 'https://www.tianqi.com/alarmnews_02/{0}'.format(x)
            print('time sleep')
            html = requests.get(url, headers=headers)
            if html.status_code == 200:
                return html.text

    html = get_page(url, headers)
    bs = BeautifulSoup(html, 'html.parser')
    return bs
list_alarm = []
list_timestamp = []
for i_x in range(1, 50):
    try:
        bs = get_climate(i_x)
        for i_index in bs.find('ul', class_='leftlist').find_all('li'):
            if len(i_index.find('img').text) > 0:
                alarm_ = i_index.find('img').text.split('release')[1]
                list_alarm.append(alarm_)
                list_timestamp.append(i_index.find('span').text.split(' ')[0])
            if len(i_index.find('img').text) == 0:
                time_str = i_index.find('span').text
                alarm_ = i_index.find('a').text.split('release')[1].replace('\n', '').replace(time_str, '')
                time_str = time_str.split(' ')[0]
                list_alarm.append(alarm_)
                list_timestamp.append(time_str)
    except:
        # pages past the last available one fail to parse; skip them
        pass
df_climate_alarm = pd.DataFrame([list_alarm,list_timestamp]).T.drop_duplicates()
df_climate_alarm.columns = ['Alarm_info','date']
df_climate_alarm = df_climate_alarm.groupby('date').agg({'Alarm_info': lambda x: x.str.cat(sep='/')}).reset_index()
df_climate_alarm
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df_climate_alarm['unic_version'] = cur_time
return_to_mysql(df_climate_alarm,'raw_climate_alarm_unic')
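return_to_mysql is the author's persistence helper and its body is not shown. A minimal sketch, assuming a MySQL target reachable through SQLAlchemy/pymysql (host, credentials and database name are placeholders):

def return_to_mysql(df, table_name):
    # Assumed implementation: write the DataFrame to MySQL via SQLAlchemy.
    # All connection parameters below are placeholders.
    engine = create_engine('mysql+pymysql://user:%s@host:3306/dbname' % parse.quote_plus('password'))
    df.to_sql(table_name, engine, if_exists='append', index=False)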

Step 2: Get Workalendar Data

from datetime import date
from workalendar.asia import China

# create a Chinese calendar object
cal = China()

# define date range
start_date = date(2021, 10, 1)
end_date = date(2023, 12, 31)

# Traverse the date range and collect holiday information.
# (DataFrame.append was removed in pandas 2.0, so build a list of rows
# and construct the DataFrame once at the end.)
rows = []
for dt in pd.date_range(start=start_date, end=end_date):
    dt = dt.date()
    is_work = cal.is_working_day(dt)
    is_holi = not is_work and cal.is_holiday(dt)
    holi_name = cal.get_holiday_label(dt) if is_holi else ""
    rows.append({"date": dt, "is_workday": is_work, "is_holiday": is_holi, "holiday_name": holi_name})
df = pd.DataFrame(rows, columns=["date", "is_workday", "is_holiday", "holiday_name"])
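A quick sanity check, assuming workalendar's bundled China holiday data covers these years:

# 2022-10-01 is National Day, a statutory non-working holiday in China
assert not cal.is_working_day(date(2022, 10, 1))
assert cal.is_holiday(date(2022, 10, 1))
print(cal.get_holiday_label(date(2022, 10, 1)))  # expected: a National Day label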
# Define a function to convert a date to the day of the week (0 = Monday, 6 = Sunday)
from datetime import datetime
def get_weekday(date_str):
    date_obj = datetime.strptime(str(date_str), "%Y-%m-%d")
    return date_obj.weekday()

# Use the apply method to apply the get_weekday function to each element in the date column to generate a new column weekday
df["weekday"] = df["date"].apply(get_weekday)

def is_holiday_(x):
    # map the boolean is_workday flag to a readable category
    return 'workday' if x else 'holiday'
df["weekday_cat"] = df["is_workday"].map(is_holiday_)
df_workday = df.drop_duplicates()
df_climate['date'] = pd.to_datetime(df_climate['date'])
df_workday['date'] = pd.to_datetime(df_workday['date'])
df_climate_alarm['date'] = pd.to_datetime(df_climate_alarm['date'])

df_climate_workday = pd.merge(df_climate,df_workday,left_on='date',right_on='date',how='left')
# note: the left frame only has a 'date' column, so join 'date' to 'date_alarm'
df_climate_workday = pd.merge(df_climate_workday, df_climate_alarm.rename(columns={'date': 'date_alarm'}),
                              left_on='date', right_on='date_alarm', how='left')

Shanghai lockdown: 2022-03-18 to 2022-04-27

mask_shutdown = (df_climate_workday['date'] >= '2022-03-18') & (df_climate_workday['date'] < '2022-04-27')
df_climate_workday.loc[mask_shutdown, 'weekday_cat'] = 'lockdown day'
def get_key_alarm(x):
    # flag alarm text that mentions any severe-weather keyword
    if pd.isna(x) == False:
        for keyword in ('Thunder', 'fog', 'Typhoon', 'storm'):
            if x.find(keyword) >= 0:
                return 'Abnormal climate'
        return 'Normal climate'
df_climate_workday['Alarm_info_cat'] = df_climate_workday['Alarm_info'].map(lambda x: get_key_alarm(x))
# keep 'date' (renamed to 'Date' below); dropping it here would break the
# column selection that follows
df_climate_workday = df_climate_workday.rename(columns={'date': 'Date', 'week': 'Week',
        'maximum temperature': 'maximum_temperature', 'minimum temperature': 'minimum_temperature',
        'weather': 'climate', 'wind direction': 'wind_direction'})
df_climate_workday = df_climate_workday[['Date', 'Week', 'maximum_temperature', 'minimum_temperature', 'climate',
       'wind_direction', 'city', 'is_workday', 'is_holiday', 'holiday_name',
       'weekday', 'weekday_cat', 'date_alarm', 'Alarm_info', 'Alarm_info_cat', 'unic_version'
       ]]
# df_climate_workday['unic_version'] = cur_time
# return_to_mysql(df_climate_workday,'raw_climate_workday_unic')
def Connect_MySQL_Get_data(sql):
    # connection details omitted by the author
    return data
df_climate_workday = Connect_MySQL_Get_data(sql="""
""")
df_climate_alarm = Connect_MySQL_Get_data(sql="""
""")

Step 4: Get Historical Replenishment Data

df_replen = Get_df_source('2022-01-21','2023-05-15')

Step 5: Get New Replenishment Data

df_replen_open = Get_df_source(s="""
    """)