Prediction of time-consuming cargo transportation based on XGBOOST model – Part 3 XGBOOST and LightGBM optimal selection model

  • When the amount of data is large, LightGBM will be significantly faster than XGBOOST, but the prediction accuracy of XGBOOST will be higher.
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# read historical data
history_data = df_replen_2[['mot', 'country', 'priority', 'cc', 'climate_count','shippingfrequency','isgreen', 'holiday_count','forwarder', 'dayofweek','ts_pu_hour', ' lt_pu_pod']]

# Select the feature columns to use
feature_cols = ['mot', 'country', 'priority', 'cc', 'climate_count', 'shippingfrequency', 'isgreen', 'holiday_count', 'forwarder', 'dayofweek']

# Use one-hot encoding to process feature columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(history_data[feature_cols])
history_data_encoded = pd.DataFrame(encoder.transform(history_data[feature_cols]))

# Get the feature column name after one-hot encoding
feature_names = encoder. get_feature_names(feature_cols)

# Merge the one-hot encoded feature column with the transportation time-consuming column
history_data_encoded["ts_pu_hour"] = history_data["ts_pu_hour"]
history_data_encoded["lt_pu_pod"] = history_data["lt_pu_pod"]


# Split the training set and test set
train_data = history_data_encoded.sample(frac=0.8, random_state=123)
test_data = history_data_encoded.drop(train_data.index)

# train the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=20, reg_alpha=9, reg_lambda=5, gamma=0.6)
xgb_model.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])

# Evaluate on the test set
r2_score = xgb_model.score(test_data.iloc[:, :-1], test_data.iloc[:, -1])
print("R2 score on test data:", r2_score)

# read new data
new_data = df_replen_open[['mot', 'country', 'priority', 'cc', 'climate_count','shippingfrequency','isgreen', 'holiday_count','forwarder', 'dayofweek','ts_pu_hour']]

# One-hot encoding of new data
new_data_encoded = pd.DataFrame(encoder.transform(new_data[feature_cols]))
new_data_encoded['ts_pu_hour'] = new_data['ts_pu_hour']

# use the model to make predictions
predictions = xgb_model. predict(new_data_encoded)

# output prediction result
print("Predictions:", predictions)
R2 score on test data: 0.9287763507620541
Predictions: [19.772514 7.7723885 6.275913 ... 13.262593 5.5251946 5.3950086]
new_data = df_replen_2[['mot', 'country', 'priority', 'cc', 'climate_count', 'shippingfrequency', 'isgreen', 'holiday_count', 'forwarder', 'dayofweek', 'ts_pu_hour ']]
new_data_encoded = pd.DataFrame(encoder.transform(new_data[feature_cols]))
new_data_encoded['ts_pu_hour'] = new_data["ts_pu_hour"]
predictions = xgb_model. predict(new_data_encoded)
df_replen_2['forecast_lt_pu_pod_xgb'] = pd.Series(predictions)
# # Define parameter grid
# param_grid = {<!-- -->
# 'n_estimators': [100, 200, 300],
# 'learning_rate': [0.1, 0.05, 0.01],
# 'max_depth': [5, 10, 15],
# 'reg_alpha': [0, 1, 5],
# 'reg_lambda': [0, 1, 5],
# 'gamma': [0.1, 0.5, 1]
# }

# # Create XGBoost regressor
# xgb_model = XGBRegressor()

# # Create grid search object
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='r2', cv=5)

# # Perform a grid search on the training set
# grid_search.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])

# # Output the best parameter combination
# print("Best parameters:", grid_search. best_params_)

# # Output the R2 score of the best model
# best_model = grid_search. best_estimator_
# r2_score = best_model.score(X_test, y_test)
# print("R2 score of the best model:", r2_score)
df_replen_open['forecast_lt_pu_pod_xgb'] = pd.Series(predictions)
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# read historical data
history_data = df_replen_2[['mot', 'country', 'priority', 'cc', 'climate_count','shippingfrequency','isgreen', 'holiday_count','forwarder', 'dayofweek','ts_pu_hour', ' lt_pu_pod']]

# Select the feature columns to use
feature_cols = ['mot', 'country', 'priority', 'cc', 'climate_count', 'shippingfrequency', 'isgreen', 'holiday_count', 'forwarder', 'dayofweek', 'ts_pu_hour']

# Use one-hot encoding to process feature columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(history_data[feature_cols])
history_data_encoded = pd.DataFrame(encoder.transform(history_data[feature_cols]))

# Get the feature column name after one-hot encoding
feature_names = encoder. get_feature_names(feature_cols)

# Merge the one-hot encoded feature column with the transportation time-consuming column
history_data_encoded["ts_pu_hour"] = history_data["ts_pu_hour"]
history_data_encoded["lt_pu_pod"] = history_data["lt_pu_pod"]


# Split the training set and test set
train_data, test_data = train_test_split(history_data_encoded, test_size=0.2, random_state=123)

# Prepare the training set data
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

# Prepare test set data
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Use the LightGBM model
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=22, reg_alpha=9, reg_lambda=5, gamma=0.6)

# fit the training set
lgb_model. fit(X_train, y_train)

# make predictions on the test set
y_pred = lgb_model. predict(X_test)

# Calculate the R2 score
r2_score_lgb = r2_score(y_test, y_pred)
print("R2 score using LightGBM:", r2_score_lgb)

# Use the model to make predictions on new data
new_data = df_replen_open[['mot', 'country', 'priority', 'cc', 'climate_count','shippingfrequency','isgreen', 'holiday_count','forwarder', 'dayofweek','ts_pu_hour']]
new_data_encoded = pd.DataFrame(encoder.transform(new_data[feature_cols]))
new_data_encoded['ts_pu_hour'] = new_data['ts_pu_hour']
predictions_lgb = lgb_model. predict(new_data_encoded)

# output prediction result
print("Predictions using LightGBM:", predictions_lgb)
[LightGBM] [Warning] Unknown parameter: gamma
R2 score using LightGBM: 0.8743557858040798
Predictions using LightGBM: [17.6757881 8.03650015 5.94436404 ... 11.29174144 5.99137554
  5.94436404]
new_data = df_replen_2[['mot', 'country', 'priority', 'cc', 'climate_count', 'shippingfrequency', 'isgreen', 'holiday_count', 'forwarder', 'dayofweek', 'ts_pu_hour ']]
new_data_encoded = pd.DataFrame(encoder.transform(new_data[feature_cols]))
new_data_encoded['ts_pu_hour'] = new_data['ts_pu_hour']
predictions_lgb = lgb_model. predict(new_data_encoded)
df_replen_2['forecast_lt_pu_pod_lgb'] = pd.Series(predictions_lgb)
df_replen_2['xgb_accuracy'] = df_replen_2['lt_pu_pod'] - df_replen_2['forecast_lt_pu_pod_xgb']
df_replen_2['lgb_accuracy'] = df_replen_2['lt_pu_pod'] - df_replen_2['forecast_lt_pu_pod_lgb']
# Get a list of unique values for different geographic locations (geo)
country_list = df_replen_2[df_replen_2['geo']=='LAS']['country'].unique()

# Iterate over each geographic location (geo)
for country in country_list:
    # Get lt_pu_pod data for a specific geographic location (geo)
    lt_pu_pod = df_replen_2[df_replen_2['country'] == country]['xgb_accuracy']
    
    # draw lt_pu_pod distribution
    plt.figure(figsize=(10, 6))
    plt.hist(lt_pu_pod, bins=30, density=True, alpha=0.7, color='b')

    # Mark the part with absolute value less than 1 as pink
    plt.hist(lt_pu_pod[np.abs(lt_pu_pod) < 1], bins=30, density=True, alpha=0.7, color='pink')

    # Mark the part whose absolute value is less than 2 and greater than or equal to 1 in red
    plt.hist(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)], bins=30, density=True, alpha=0.7, color='red')
    
    # Set title and axis labels
    plt.title(f'xgb_accuracy Distribution (country: {<!-- -->country})')
    plt.xlabel('xgb_accuracy')
    plt.ylabel('Probability Density')
    
    # Calculate the proportion of parts less than 1
    ratio_lt_1 = len(lt_pu_pod[np.abs(lt_pu_pod) < 1]) / len(lt_pu_pod)

    # Calculate the proportion of parts less than 2
    ratio_lt_2 = len(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)]) / len(lt_pu_pod)
    
    # Print the proportion of the part less than 1 and the proportion of the part less than 2
    print(f'Geographic location (country: {<!-- -->country}) is less than 1: {<!-- -->ratio_lt_1}')
    print(f'Geographic location (country: {<!-- -->country}) is less than 2: {<!-- -->ratio_lt_2 + ratio_lt_1}')
    
    # display graphics
    plt. show()
Proportion of geographical location (country: MX) less than 1: 0.5792474344355758
Proportion of geographical location (country: MX) less than 2: 0.7537058152793614

Proportion of geographical location (country: CL) less than 1: 0.5108045977011494
Proportion of geographical location (country: CL) less than 2: 0.7190804597701149

Proportion of geographical location (country: PE) less than 1: 0.5823324292909725
Proportion of geographical location (country: PE) less than 2: 0.8008523827973654

Proportion of geographical location (country: AR) less than 1: 0.45614035087719296
Proportion of geographical location (country: AR) less than 2: 0.7345029239766081

Proportion of geographical location (country: CO) less than 1: 0.5929791271347249
The proportion of geographical location (country: CO) less than 2: 0.8026565464895636

# Get a list of unique values for different geographic locations (geo)
geo_list = df_replen_2['geo'].unique()

# Iterate over each geographic location (geo)
for geo in geo_list:
    # Get lt_pu_pod data for a specific geographic location (geo)
    lt_pu_pod = df_replen_2[df_replen_2['geo'] == geo]['xgb_accuracy']
    
    # draw lt_pu_pod distribution
    plt.figure(figsize=(10, 6))
    plt.hist(lt_pu_pod, bins=30, density=True, alpha=0.7, color='b')

    # Mark the part with absolute value less than 1 as pink
    plt.hist(lt_pu_pod[np.abs(lt_pu_pod) < 1], bins=30, density=True, alpha=0.7, color='pink')

    # Mark the part whose absolute value is less than 2 and greater than or equal to 1 in red
    plt.hist(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)], bins=30, density=True, alpha=0.7, color='red')
    
    # Set title and axis labels
    plt.title(f'xgb_accuracy Distribution (geo: {<!-- -->geo})')
    plt.xlabel('xgb_accuracy')
    plt.ylabel('Probability Density')
    
    # Calculate the proportion of parts less than 1
    ratio_lt_1 = len(lt_pu_pod[np.abs(lt_pu_pod) < 1]) / len(lt_pu_pod)

    # Calculate the proportion of parts less than 2
    ratio_lt_2 = len(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)]) / len(lt_pu_pod)
    
    # Print the proportion of the part less than 1 and the proportion of the part less than 2
    print(f'Geographic location (geo: {<!-- -->geo}) is less than 1: {<!-- -->ratio_lt_1}')
    print(f'Geographic location (geo: {<!-- -->geo}) is less than 2: {<!-- -->ratio_lt_2 + ratio_lt_1}')
    
    # display graphics
    plt. show()
Proportion of geographical location (geo: LAS) less than 1: 0.5485282418456643
The proportion of the part whose geographic location (geo: LAS) is less than 2: 0.7645186953062848

Proportion of geographical location (geo: AP) less than 1: 0.7526357962213225
The proportion of the part whose geographic location (geo: AP) is less than 2: 0.8919323549257759

Proportion of geographical location (geo: EMEA) less than 1: 0.7375867592098239
Proportion of geographical location (geo: EMEA) less than 2: 0.9040754582665955

Proportion of geographical location (geo: NA) less than 1: 0.5368736150526766
The proportion of the part whose geographic location (geo: NA) is less than 2: 0.7902266454146095

# Get a list of unique values for different geographic locations (geo)
geo_list = df_replen_2['geo'].unique()

# Iterate over each geographic location (geo)
for geo in geo_list:
    # Get lt_pu_pod data for a specific geographic location (geo)
    lt_pu_pod = df_replen_2[df_replen_2['geo'] == geo]['lgb_accuracy']
    
    # draw lt_pu_pod distribution
    plt.figure(figsize=(10, 6))
    plt.hist(lt_pu_pod, bins=30, density=True, alpha=0.7, color='b')

    # Mark the part with absolute value less than 1 as pink
    plt.hist(lt_pu_pod[np.abs(lt_pu_pod) < 1], bins=30, density=True, alpha=0.7, color='pink')

    # Mark the part whose absolute value is less than 2 and greater than or equal to 1 in red
    plt.hist(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)], bins=30, density=True, alpha=0.7, color='red')
    
    # Set title and axis labels
    plt. title(f'lgb_accuracy Distribution (geo: {<!-- -->geo})')
    plt.xlabel('lgb_accuracy')
    plt.ylabel('Probability Density')
    
    # Calculate the proportion of parts less than 1
    ratio_lt_1 = len(lt_pu_pod[np.abs(lt_pu_pod) < 1]) / len(lt_pu_pod)

    # Calculate the proportion of parts less than 2
    ratio_lt_2 = len(lt_pu_pod[(np.abs(lt_pu_pod) >= 1) & (np.abs(lt_pu_pod) < 2)]) / len(lt_pu_pod)
    
    # Print the proportion of the part less than 1 and the proportion of the part less than 2
    print(f'Geographic location (geo: {<!-- -->geo}) is less than 1: {<!-- -->ratio_lt_1}')
    print(f'Geographic location (geo: {<!-- -->geo}) is less than 2: {<!-- -->ratio_lt_2 + ratio_lt_1}')
    
    # display graphics
    plt. show()
Proportion of the part whose geographic location (geo: LAS) is less than 1: 0.22381331211880137
Proportion of geographical location (geo: LAS) less than 2: 0.45054362238133117

Proportion of geographical location (geo: AP) less than 1: 0.45124831309041835
The proportion of the part whose geographic location (geo: AP) is less than 2: 0.7465418353576248

Proportion of geographical location (geo: EMEA) less than 1: 0.5302100017796761
Proportion of geographical location (geo: EMEA) less than 2: 0.8165153941982559

Proportion of geographical location (geo: NA) less than 1: 0.39419974342028535
The proportion of the part whose geographic location (geo: NA) is less than 2: 0.647980406639972

Proportion of geographical location (geo: Brazil) less than 1: 0.0
Proportion of geographical location (geo: Brazil) less than 2: 1.0

# Draw a box plot according to geo grouping
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_replen_2, x='geo', y='xgb_accuracy')
plt.xlabel('Geo')
plt.ylabel('xgb_accuracy')
plt. title('Boxplot of xgb_accuracy by Geo')
plt. xticks(rotation=45)
plt.tight_layout()
plt. show()

group_mot_climate_pu_atd_lgb = df_replen_2.groupby(['geo','mot']).agg({<!-- -->'lgb_accuracy':['std','count','mean', lambda x: x.quantile(0.5), lambda x:x.quantile(0.9), lambda x:x.quantile(0.1)]}).reset_index()
group_mot_climate_pu_atd_lgb.columns = ['geo','mot','std','count','mean','quantile_5','quantile_9','quantile_1']
group_mot_climate_pu_atd_lgb
geo mot std count mean quantile_5 quantile_9 quantile_1
0 AP AIR 2.342541 37683 -0.040538 -0.456005 2.381934 -2.108590
1 AP SEA 2.965705 7366 0.021388 -0.426507 3.936324 -3.207305
2 AP TRUCK 0.922056 2375 -0.280426 -0.378388 0.641612 -1.111371
3 Brazil AIR NaN 1 1.869254 1.869254 1.869254 1.869254
group_mot_climate_pu_atd_xgb = df_replen_2.groupby(['geo','mot']).agg({<!-- -->'xgb_accuracy':['std','count','mean', lambda x: x.quantile(0.5), lambda x:x.quantile(0.9), lambda x:x.quantile(0.1)]}).reset_index()
group_mot_climate_pu_atd_xgb.columns = ['geo','mot','std','count','mean','quantile_5','quantile_9','quantile_1']
group_mot_climate_pu_atd_xgb
geo mot std count mean quantile_5 quantile_9 quantile_1
0 AP AIR 1.578557 37683 0.003911 -0.083548 1.231544 -1.290093
1 AP SEA 1.442503 7366 -0.001473 0.000532 1.020673 -1.118309
2 AP TRUCK 0.599171 2375 -0.006152 -0.085521 0.670907 -0.483689
3 Brazil AIR NaN 1 0.874765 0.874765 0.874765 0.874765
4 EMEA AIR 1.248103 21557 0.002630 -0.047782 1.242728 -1.269656
5 EMEA SEA 1.317035 919 0.073530 0.004353 0.562636 -0.495532