SVR, AdaBoost, MLP, GBDT, XGBoost, LightGBM and Random Forest: hyperparameter optimization + model training + SHAP explanation

  • Import the required libraries and data processing
  • Model hyperparameter optimization
  • Split the training and test sets and perform SHAP interpretation

Import the required libraries and data processing

import numpy as np  # NumPy for multi-dimensional arrays and matrix operations
import pandas as pd  # Pandas for data processing and analysis
import matplotlib.pyplot as plt  # Matplotlib for data visualization
from sklearn.model_selection import train_test_split, cross_val_score, KFold  # Data splitting and cross-validation utilities
from sklearn.preprocessing import StandardScaler  # Standardization of features and targets
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error  # Regression evaluation metrics
import warnings  # Control the display of warning messages
warnings.filterwarnings('ignore')  # Suppress warning messages

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval  # Bayesian hyperparameter optimization (TPE)

Read data

data = pd.read_excel('data.xlsx') # Read the Excel file and return a DataFrame object

Handle missing values and outliers

# Use isna() or isnull() to identify missing values
missing_values = data.isna() # or df.isnull()

# Count the number of missing values in each column
missing_counts = missing_values.sum()
print(missing_counts)
data = data.fillna(0)  # Fill missing values with 0

# Replace sentinel outliers (coded as -9999) with the mean of the remaining values in each column
for col in data.columns:
    col_mean = data.loc[data[col] != -9999, col].mean()
    data.loc[data[col] == -9999, col] = col_mean

Data preparation and standardization

X = data[['Y', 'GDP', 'GYQY', 'POPD', 'YJDG', 'RSEI', 'DEM', 'Aspect',
          'MRVBF', 'TPI', 'SC', 'SM', 'GPP']]  # Select the specified columns
X.describe()  # Descriptive statistics: mean, standard deviation, minimum, maximum, etc.
X_train = data.drop(['Y'], axis=1)  # Remove the target variable; the remaining features become X_train
y_train = np.array(data['Y'].copy())  # Assign the target variable Y to y_train

# Standardize features and target
standarder = StandardScaler() # Create a StandardScaler object to standardize features
X_train = standarder.fit_transform(X_train)
y_standarder = StandardScaler() # Create a StandardScaler object to standardize the target variable
y_train = np.expand_dims(y_train, axis=1) # Convert y_train to a two-dimensional array
y_train = y_standarder.fit_transform(y_train)# Standardize the target variable of the training set
y_train = y_train.squeeze(axis=1) # Restore y_train to a one-dimensional array

Model hyperparameter optimization

First, define the objective function

def objective(params):

    model = AdaBoostRegressor()  # Default model; replaced by one of the branches below
    if params['model_name'] == 'RF':
        del params['model_name']  # Remove model_name; it is not a model parameter
        model = RandomForestRegressor(**params, random_state=42)  # Create a random forest

    elif params['model_name'] == 'SVR':
        del params['model_name']
        model = SVR(kernel='rbf', **params)  # Create an SVR with an RBF kernel

    elif params['model_name'] == 'ADABOOST':
        del params['model_name']
        model = AdaBoostRegressor(**params)  # Create an AdaBoost regressor

    elif params['model_name'] == 'MLP':
        del params['model_name']
        model = MLPRegressor(**params)  # Create an MLP regressor

    elif params['model_name'] == 'GBDT':
        del params['model_name']
        model = GradientBoostingRegressor(**params)  # Create a GBDT regressor

    elif params['model_name'] == 'LIGHTGBM':
        del params['model_name']
        model = LGBMRegressor(**params)  # Create a LightGBM regressor

    elif params['model_name'] == 'XGBOOST':
        del params['model_name']
        model = XGBRegressor(**params)  # Create an XGBoost regressor
    # Five-fold cross-validation
    folds = 5
    mse_test = 0
    mae_test = 0
    r2_test = 0
    kfold = KFold(n_splits=folds, shuffle=True, random_state=5421)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        # print('-------fold {}-------'.format(fold + 1))
        x_tra, y_trn = X_train[trn_idx], y_train[trn_idx]
        x_val, y_val = X_train[val_idx], y_train[val_idx]
        model.fit(x_tra, y_trn)
        test_pred = model.predict(x_val)

        # Accumulate the metrics over the folds
        mse_test += mean_squared_error(y_true=y_val, y_pred=test_pred)
        mae_test += mean_absolute_error(y_true=y_val, y_pred=test_pred)
        r2_test += r2_score(y_true=y_val, y_pred=test_pred)

    # Average the metrics over all folds
    mse_test /= folds
    mae_test /= folds
    r2_test /= folds

    # Collect the results in a DataFrame (useful for inspection)
    result_df = pd.DataFrame({'RMSE': [mse_test ** 0.5],
                              'MAE': [mae_test],
                              'R2': [r2_test],
                              }, index=['cross-validation'])
    # print(result_df)
    return {'loss': mse_test, 'status': STATUS_OK}  # loss is the mean cross-validated MSE; STATUS_OK marks a successful evaluation

Define the optimalizim (optimization) function

def optimalizim(objective, space):
    trials = Trials()  # Create a Trials object to record the parameters and results of every trial
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=1,  # Number of evaluations; kept at 1 here for speed, increase for a more thorough search
                trials=trials)  # Run the optimization and store the results in the Trials object
    params = space_eval(space, best) #Extract the best parameters from the hyperparameter space through the space_eval function
    print(f'Best parameters:{params}')
    model_name = params['model_name']
    del params['model_name'] # Delete the model name because this parameter is no longer needed when building the model.
    return params, model_name

Define the eva (evaluation) function

def eva(model, params, is_train):
    # Build the model with the optimal parameters that were found

    model = model.set_params(**params)
    model.fit(X_train, y_train)

    # Predict
    train_pred = model.predict(X_train)
    if not is_train:
        train_pre = y_standarder.inverse_transform(train_pred.reshape(-1, 1)).ravel()  # Back-transform predictions to the original scale
        data['pred'] = train_pre
        data.to_excel(model.__class__.__name__ + ' model prediction results.xlsx')
        result_plot(y_train, train_pred, model)  # result_plot is a plotting helper defined elsewhere
        
    mse_train = mean_squared_error(y_true=y_train, y_pred=train_pred)
    mae_train = mean_absolute_error(y_true=y_train, y_pred=train_pred)
    r2_train = r2_score(y_true=y_train, y_pred=train_pred)

    result_df = pd.DataFrame({'RMSE': [mse_train ** 0.5],
                              'MAE': [mae_train],
                              'R2': [r2_train],
                              }, index=['Evaluation results'])
    print(result_df)
    return r2_train
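
The train helper used below is not published in this post (see the tip at the end). Based only on how it is called later, i.e. train(clf, space, is_train) returning (r2, model_name, params), a minimal sketch that simply chains optimalizim and eva might look like this; it is an assumption, not the author's original implementation:

def train(clf, space, is_train):
    # Hypothetical reconstruction of the missing helper: tune the hyperparameters for this
    # model's search space, then refit and evaluate the model with the tuned parameters.
    params, model_name = optimalizim(objective, space)  # Run the hyperopt search for this space
    r2 = eva(clf, params, is_train)                      # Evaluate the tuned model
    return r2, model_name, params                        # Matches the unpacking used by model_select and the SHAP section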

Define the parameter spaces and the model selection function

clf1 = SVR()
clf2 = MLPRegressor()
clf3 = RandomForestRegressor()
clf4 = AdaBoostRegressor(random_state=123)
clf5 = GradientBoostingRegressor(criterion='friedman_mse')
clf6 = LGBMRegressor()
clf7 = XGBRegressor()
# clf4 = AdaBoostRegressor(n_estimators=50, random_state=123,learning_rate=1.0)
# clf5 = GradientBoostingRegressor(learning_rate=0.1,n_estimators=100,subsample=1.0,criterion='friedman_mse')
# Define the search space dictionaries
space_RF = {
    'model_name': hp.choice('model_name', ['RF']),  # Fixed model name (single choice)
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),  # Number of trees
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'max_features': hp.choice('max_features', range(1, X_train.shape[1]))
}
space_SVR = {
    'model_name': hp.choice('model_name', ['SVR']),  # Fixed model name (single choice)
    'C': hp.choice('C', np.arange(0.1, 10, 0.1)),  # Regularization strength
    'gamma': hp.choice('gamma', np.arange(0.1, 50)),
    'epsilon': hp.choice('epsilon', np.arange(0, 1, 0.1)),
}

space_ADABOOST = {
    'model_name': hp.choice('model_name', ['ADABOOST']),  # Fixed model name; must match the branch name in objective()
    'n_estimators': hp.choice('n_estimators', np.arange(10, 300, 10)),
    'learning_rate': hp.choice('learning_rate', np.arange(0.1, 5, 0.1)),
}

space_MLP = {
    'model_name': hp.choice('model_name', ['MLP']),  # Fixed model name (single choice)
    'batch_size': hp.choice('batch_size', np.arange(5, 30, 5)),
    'max_iter': hp.choice('max_iter', np.arange(50, 500, 50)),
}
space_GBDT = {
    'model_name': hp.choice('model_name', ['GBDT']),  # Fixed model name (single choice)
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),  # Number of boosting stages
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'learning_rate': hp.choice('learning_rate', np.arange(0.1, 2, 0.1)),
    'subsample': hp.choice('subsample', np.arange(0.1, 1, 0.1)),
}

space_LIGHTGBM = {
    'model_name': hp.choice('model_name', ['LIGHTGBM']),  # Fixed model name (single choice)
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),  # Number of boosting rounds
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'num_leaves': hp.choice('num_leaves', range(2, 10)),
    # 'device': hp.choice('device', ['gpu']),
    # 'gpu_platform_id': hp.choice('gpu_platform_id', ['0']),
    # 'gpu_device_id': hp.choice('gpu_device_id', ['0']),
}

space_XGBOOST = {
    'model_name': hp.choice('model_name', ['XGBOOST']),  # Fixed model name (single choice)
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),  # Number of boosting rounds
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'gamma': hp.choice('gamma', np.arange(0.2, 1, 0.1)),
    # 'tree_method': hp.choice('tree_method', ['gpu_hist']),
    # 'gpu_id': hp.choice('gpu_id', [0]),
}
space = [space_SVR, space_MLP, space_RF, space_ADABOOST, space_GBDT, space_LIGHTGBM, space_XGBOOST]
clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]
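
As a quick sanity check before the full selection loop, a single search space can be tuned and evaluated on its own (a small usage sketch built from the helpers defined above; the random forest is chosen here only as an example):

# Tune the random forest space alone, then evaluate with the tuned parameters
rf_params, rf_name = optimalizim(objective, space_RF)
rf_r2 = eva(clf3, rf_params, True)
print(rf_name, rf_r2)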



def model_select(clfs, spaces, is_train):
    model_best_name = ''
    r2 = 0
    for clf, space in zip(clfs, spaces):
        r2_train, model_best, _ = train(clf, space, is_train)
        if r2_train > r2:
            model_best_name = model_best
            r2 = r2_train

    print(f'The best model in this round is {model_best_name}')
    print(f'The r2 of the best model in this round is {r2}')
    return model_best_name, r2

Use majority voting to select the best model

results = []
for i in range(10):
    result, r2 = model_select(clfs, space, True)
    results.append(result)
results_order = pd.Series(results).value_counts().to_dict()  # Count how often each model wins a round
best_model_last = list(results_order.keys())[0]  # value_counts sorts by count, so the first key is the most frequent winner
print(f'The best model is {best_model_last}')

Split the training and test sets and perform SHAP interpretation

data_train, data_test = train_test_split(X, test_size=24, random_state=2) # Randomly divide the data set into a training set and a test set
pd.merge(data['FID'],data_train,left_index=True,right_index=True,how='right').to_excel('./train_data.xlsx',index=None)
pd.merge(data['FID'],data_test,left_index=True,right_index=True,how='right').to_excel('./test_data.xlsx',index=None)
print(len(data_train)) # Output the number of samples in the training set
print(len(data_test)) # Output the number of samples in the test set

X_train_SHAP = data_train.drop(['Y'], axis=1)  # Remove the target variable from the training set; the remaining features become X_train_SHAP
y_train_SHAP = np.array(data_train['Y'].copy())  # Assign the target variable Y to y_train_SHAP
features = X_train_SHAP.columns.to_list()  # Get feature names

X_test_SHAP = data_test.drop(['Y'], axis=1)  # Remove the target variable from the test set; the remaining features become X_test_SHAP
y_test_SHAP = np.array(data_test['Y'].copy())  # Assign the target variable Y to y_test_SHAP

standarder = StandardScaler() # Create a StandardScaler object to standardize features
X_train_SHAP = standarder.fit_transform(X_train_SHAP) # Standardize the training set features
X_test_SHAP = pd.DataFrame(standarder.transform(X_test_SHAP), columns =features ) # Standardize the test set features

y_standarder = StandardScaler() # Create a StandardScaler object to standardize the target variable
y_train_SHAP = np.expand_dims(y_train_SHAP, axis=1) # Convert y_train to a two-dimensional array
y_test_SHAP = np.expand_dims(y_test_SHAP, axis=1) # Convert y_test to a two-dimensional array
y_train_SHAP = y_standarder.fit_transform(y_train_SHAP)# Standardize the target variable of the training set
y_test_SHAP = y_standarder.transform(y_test_SHAP) # Standardize the target variable of the test set
y_train_SHAP = y_train_SHAP.squeeze(axis=1) # Restore y_train to a one-dimensional array
y_test_SHAP = y_test_SHAP.squeeze(axis=1) # Restore y_test to a one-dimensional array

Pass in the optimal model and perform SHAP interpretation

import shap
clf1 = SVR()
clf2 = MLPRegressor()
clf3 = RandomForestRegressor()
clf4 = AdaBoostRegressor(random_state=123)
clf5 = GradientBoostingRegressor(criterion='friedman_mse')
clf6 = LGBMRegressor()
clf7 = XGBRegressor()
shap.initjs()
if best_model_last == 'GBDT':
    _ , _ , best_params = train(GradientBoostingRegressor(), space_GBDT, True)
    model = GradientBoostingRegressor()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)

    
elif best_model_last == 'RF':
    _ , _ , best_params = train(RandomForestRegressor(), space_RF, True)
    model = RandomForestRegressor()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP) # Feature ph dependency graph
    
elif best_model_last == 'LIGHTGBM':
    _ , _ , best_params = train(LGBMRegressor(), space_LIGHTGBM, True)
    model = LGBMRegressor()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)

elif best_model_last == 'XGBOOST':
    _ , _ , best_params = train(XGBRegressor(), space_XGBOOST, True)
    model = XGBRegressor()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)
    
    
elif best_model_last == 'MLP':
    _ , _ , best_params = train(MLPRegressor(), space_MLP, True)
    model = MLPRegressor()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    X_train_summary = shap.kmeans(X_train_SHAP, 10)
    def predict_fn(X):
        return model.predict(X)

    explainer = shap.KernelExplainer(predict_fn, X_train_summary)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP) # Feature ph dependency graph)
    
elif best_model_last == 'SVR':
    _ , _ , best_params = train(SVR(), space_SVR, True)
    model = SVR()
    
    #Set model parameters
    model.set_params(**best_params) # Use the best parameters you determined before
    model.fit(X_train_SHAP, y_train_SHAP)
    X_train_summary = shap.kmeans(X_train_SHAP, 10)
    def predict_fn(X):
        return model.predict(X)

    explainer = shap.KernelExplainer(predict_fn, X_train_summary)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP) # Feature ph dependency graph)
    
shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
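
If this is run as a plain script rather than in a notebook, the SHAP figures can be written to disk instead of displayed. A minimal sketch, assuming the default matplotlib backend (the output file names are only illustrative):

# Save the summary plot to a PNG instead of showing it interactively
shap.summary_plot(shap_values, X_test_SHAP, show=False)
plt.savefig('shap_summary.png', dpi=300, bbox_inches='tight')
plt.close()

# Force plots are JavaScript objects; they can be written to an HTML file
force_plot = shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
shap.save_html('shap_force_plot.html', force_plot)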

Tip: The code is missing the train function. Please contact me to purchase the executable code.