SVR, MLP, AdaBoost, GBDT, XGBoost, LightGBM, and Random Forest: model hyperparameter optimization + model training + SHAP explanation
- Import the required libraries and preprocess the data
- Model hyperparameter optimization
- Split the training and test sets and perform SHAP interpretation
Import the required libraries and preprocess the data
import numpy as np  # NumPy, for multi-dimensional arrays and matrix operations
import pandas as pd  # Pandas, for data processing and analysis
import matplotlib.pyplot as plt  # Matplotlib, for data visualization
from sklearn.model_selection import train_test_split, cross_val_score, KFold  # Data splitting and cross-validation utilities
from sklearn.preprocessing import StandardScaler  # Standardization of features and targets
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor  # Ensemble regressors
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error  # Regression evaluation metrics
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval  # Hyperopt, for Bayesian hyperparameter optimization
import warnings  # Control the display of warning messages
warnings.filterwarnings('ignore')  # Suppress warning messages
Read data
data = pd.read_excel('data.xlsx') # Read the Excel file and return a DataFrame object
Handle missing values and outliers
# Use isna() (or isnull()) to identify missing values
missing_values = data.isna()
# Count the number of missing values in each column
missing_counts = missing_values.sum()
print(missing_counts)
data = data.fillna(0)  # Fill missing values with 0

# Replace the outlier sentinel -9999 with the mean of each column's valid values
for i in data.columns:
    a = data.iloc[data[data[i] != -9999].index.tolist()][i].mean()
    data.loc[data[data[i] == -9999].index.tolist(), i] = a
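For reference, the per-column loop above can be written more concisely with pandas; this is a minimal equivalent sketch, assuming -9999 is the only outlier sentinel in the data:

# Equivalent, more idiomatic version of the loop above:
# replace every -9999 with the mean of that column's valid values
for col in data.columns:
    valid_mean = data.loc[data[col] != -9999, col].mean()
    data[col] = data[col].mask(data[col] == -9999, valid_mean)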
Data preparation and standardization
X = data[['Y', 'GDP', 'GYQY', 'POPD', 'YJDG', 'RSEI', 'DEM', 'Aspect', 'MRVBF', 'TPI', 'SC', 'SM', 'GPP']]  # Select the specified columns
X.describe()  # Display descriptive statistics: mean, standard deviation, minimum, maximum, etc.

X_train = data.drop(['Y'], axis=1)  # Remove the target variable and assign the remaining features to X_train
y_train = np.array(data['Y'].copy())  # Assign the target variable Y to y_train

# Standardization
standarder = StandardScaler()  # StandardScaler object for the features
X_train = standarder.fit_transform(X_train)
y_standarder = StandardScaler()  # StandardScaler object for the target variable
y_train = np.expand_dims(y_train, axis=1)  # Convert y_train to a two-dimensional array
y_train = y_standarder.fit_transform(y_train)  # Standardize the training-set target
y_train = y_train.squeeze(axis=1)  # Restore y_train to a one-dimensional array
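Because the target is standardized, any predictions reported later must be mapped back to the original scale with inverse_transform (as the eva function below does). A quick sanity check of that round trip:

# Sanity check: standardizing and then inverting should recover the original Y
y_back = y_standarder.inverse_transform(y_train.reshape(-1, 1)).squeeze(axis=1)
assert np.allclose(y_back, data['Y'].to_numpy())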
Model hyperparameter optimization
First, define the objective function
def objective(params):
    model = AdaBoostRegressor()  # Fallback model
    if params['model_name'] == 'RF':
        del params['model_name']  # Remove model_name; it is not a model parameter
        model = RandomForestRegressor(**params, random_state=42)
    elif params['model_name'] == 'SVR':
        del params['model_name']
        model = SVR(kernel='rbf', **params)
    elif params['model_name'] == 'ADABOOST':
        del params['model_name']
        model = AdaBoostRegressor(**params)
    elif params['model_name'] == 'MLP':
        del params['model_name']
        model = MLPRegressor(**params)
    elif params['model_name'] == 'GBDT':
        del params['model_name']
        model = GradientBoostingRegressor(**params)
    elif params['model_name'] == 'LIGHTGBM':
        del params['model_name']
        model = LGBMRegressor(**params)
    elif params['model_name'] == 'XGBOOST':
        del params['model_name']
        model = XGBRegressor(**params)

    # Five-fold cross-validation
    folds = 5
    mse_test = 0
    mae_test = 0
    r2_test = 0
    kfold = KFold(n_splits=folds, shuffle=True, random_state=5421)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        # print('-------fold {}-------'.format(fold + 1))
        x_tra, y_trn = X_train[trn_idx], y_train[trn_idx]
        x_val, y_val = X_train[val_idx], y_train[val_idx]
        model.fit(x_tra, y_trn)
        test_pred = model.predict(x_val)
        # Average the metrics over the folds
        mse_test += mean_squared_error(y_true=y_val, y_pred=test_pred) / folds
        mae_test += mean_absolute_error(y_true=y_val, y_pred=test_pred) / folds
        r2_test += r2_score(y_true=y_val, y_pred=test_pred) / folds

    # Save the cross-validated results to a DataFrame
    result_df = pd.DataFrame({'RMSE': [mse_test**0.5],
                              'MAE': [mae_test],
                              'R2': [r2_test]},
                             index=['cross-validation'])
    # print(result_df)
    return {'loss': mse_test, 'status': STATUS_OK}  # loss is the mean cross-validated MSE; STATUS_OK signals a successful evaluation
Define the optimalizim function
def optimalizim(objective, space):
    trials = Trials()  # Trials object that records the parameters and results of each evaluation
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=1, trials=trials)  # Run the TPE optimization; increase max_evals for a real search
    params = space_eval(space, best)  # Recover the best parameters from the hyperparameter space
    print(f'Best parameters: {params}')
    model_name = params['model_name']
    del params['model_name']  # Drop the model name; it is not needed when building the model
    return params, model_name
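As a quick usage sketch (the search spaces such as space_RF are defined in the next section):

# Hypothetical usage: tune a single model with the random forest space defined below
params, model_name = optimalizim(objective, space_RF)
print(model_name, params)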
Define the eva function
def eva(model, params, is_train):
    # Build the model with the optimal parameters found above
    model = model.set_params(**params)
    model.fit(X_train, y_train)
    # Predict
    train_pred = model.predict(X_train)
    if not is_train:
        # Map the predictions back to the original scale and export them
        train_pre = y_standarder.inverse_transform(train_pred.reshape(-1, 1))
        data['pred'] = train_pre
        data.to_excel(str(model.__class__.__name__) + 'Model prediction results.xlsx')
        result_plot(y_train, train_pred, model)  # Plotting helper (defined outside this excerpt)
    mse_train = mean_squared_error(y_true=y_train, y_pred=train_pred)
    mae_train = mean_absolute_error(y_true=y_train, y_pred=train_pred)
    r2_train = r2_score(y_true=y_train, y_pred=train_pred)
    result_df = pd.DataFrame({'RMSE': [mse_train**0.5],
                              'MAE': [mae_train],
                              'R2': [r2_train]},
                             index=['Evaluation results'])
    print(result_df)
    return r2_train
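The train function used below is not included in this article (see the tip at the end). Judging from its call sites, a minimal sketch might simply chain optimalizim and eva; this is an assumption, and the author's actual implementation may differ:

# A minimal sketch of the omitted train() helper, inferred from how it is called:
# tune the model on the given space, evaluate it, and return (r2, model_name, params).
# This is an assumption; the original implementation is not shown here.
def train(model, space, is_train):
    params, model_name = optimalizim(objective, space)
    r2 = eva(model, params, is_train)
    return r2, model_name, params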
Define parameter space and model selection function
clf1 = SVR()
clf2 = MLPRegressor()
clf3 = RandomForestRegressor()
clf4 = AdaBoostRegressor(random_state=123)
clf5 = GradientBoostingRegressor(criterion='friedman_mse')
clf6 = LGBMRegressor()
clf7 = XGBRegressor()
# clf4 = AdaBoostRegressor(n_estimators=50, random_state=123, learning_rate=1.0)
# clf5 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse')

# Define the search space dictionaries
space_RF = {
    'model_name': hp.choice('model_name', ['RF']),  # Fixed model name (single-element choice)
    'n_estimators': hp.choice('n_estimators', range(10, 500)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'max_features': hp.choice('max_features', range(1, X_train.shape[1]))
}
space_SVR = {
    'model_name': hp.choice('model_name', ['SVR']),
    'C': hp.choice('C', np.arange(0.1, 10, 0.1)),
    # 'gamma': hp.choice('gamma', np.arange(0.1, 50)),
    'epsilon': hp.choice('epsilon', np.arange(0, 1, 0.1)),
}
space_ADABOOST = {
    'model_name': hp.choice('model_name', ['ADABOOST']),  # Must match the name checked in objective()
    'n_estimators': hp.choice('n_estimators', np.arange(10, 300, 10)),
    # 'learning_rate': hp.choice('learning_rate', np.arange(0.1, 5, 0.1)),
}
space_MLP = {
    'model_name': hp.choice('model_name', ['MLP']),
    'batch_size': hp.choice('batch_size', np.arange(5, 30, 5)),
    # 'max_iter': hp.choice('max_iter', np.arange(50, 500, 50)),
}
space_GBDT = {
    'model_name': hp.choice('model_name', ['GBDT']),
    'n_estimators': hp.choice('n_estimators', range(10, 500)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'learning_rate': hp.choice('learning_rate', np.arange(0.1, 2, 0.1)),
    'subsample': hp.choice('subsample', np.arange(0.1, 1, 0.1)),
}
space_LIGHTGBM = {
    'model_name': hp.choice('model_name', ['LIGHTGBM']),
    'n_estimators': hp.choice('n_estimators', range(10, 500)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'num_leaves': hp.choice('num_leaves', range(2, 10)),
    # 'device': hp.choice('device', ['gpu']),
    # 'gpu_platform_id': hp.choice('gpu_platform_id', ['0']),
    # 'gpu_device_id': hp.choice('gpu_device_id', ['0']),
}
space_XGBOOST = {
    'model_name': hp.choice('model_name', ['XGBOOST']),
    'n_estimators': hp.choice('n_estimators', range(10, 500)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'gamma': hp.choice('gamma', np.arange(0.2, 1, 0.1)),
    # 'tree_method': hp.choice('tree_method', ['gpu_hist']),
    # 'gpu_id': hp.choice('gpu_id', [0]),
}

spaces = [space_SVR, space_MLP, space_RF, space_ADABOOST, space_GBDT, space_LIGHTGBM, space_XGBOOST]
clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]

def model_select(clfs, spaces, is_train):
    model_best_name = ''
    r2 = 0
    for clf, space in zip(clfs, spaces):
        r2_train, model_best, _ = train(clf, space, is_train)
        if r2_train > r2:
            model_best_name = model_best
            r2 = r2_train
    print(f'The best model in this round is {model_best_name}')
    print(f'The r2 of the best model in this round is {r2}')
    return model_best_name, r2
Use majority voting to select the best model
results = []
for i in range(10):
    result, r2 = model_select(clfs, spaces, True)
    results.append(result)

results_order = pd.Series(results).value_counts().to_dict()
best_model_last = list(results_order.keys())[0]  # The most frequently selected model wins the vote
print(f'The best model is {best_model_last}')
Split the training and test sets and perform SHAP interpretation
data_train, data_test = train_test_split(X, test_size=24, random_state=2)  # Randomly split the data set into a training set and a test set
pd.merge(data['FID'], data_train, left_index=True, right_index=True, how='right').to_excel('./train_data.xlsx', index=None)
pd.merge(data['FID'], data_test, left_index=True, right_index=True, how='right').to_excel('./test_data.xlsx', index=None)
print(len(data_train))  # Number of samples in the training set
print(len(data_test))  # Number of samples in the test set

X_train_SHAP = data_train.drop(['Y'], axis=1)  # Remove the target variable; keep the features
y_train_SHAP = np.array(data_train['Y'].copy())  # Training-set target Y
features = X_train_SHAP.columns.to_list()  # Feature names
X_test_SHAP = data_test.drop(['Y'], axis=1)  # Remove the target variable from the test set
y_test_SHAP = np.array(data_test['Y'].copy())  # Test-set target Y

standarder = StandardScaler()  # StandardScaler object for the features
X_train_SHAP = standarder.fit_transform(X_train_SHAP)  # Fit on the training features
X_test_SHAP = pd.DataFrame(standarder.transform(X_test_SHAP), columns=features)  # Transform the test features with the training statistics
y_standarder = StandardScaler()  # StandardScaler object for the target variable
y_train_SHAP = np.expand_dims(y_train_SHAP, axis=1)  # Convert y_train_SHAP to a two-dimensional array
y_test_SHAP = np.expand_dims(y_test_SHAP, axis=1)  # Convert y_test_SHAP to a two-dimensional array
y_train_SHAP = y_standarder.fit_transform(y_train_SHAP)  # Standardize the training-set target
y_test_SHAP = y_standarder.transform(y_test_SHAP)  # Standardize the test-set target with the training statistics
y_train_SHAP = y_train_SHAP.squeeze(axis=1)  # Restore to a one-dimensional array
y_test_SHAP = y_test_SHAP.squeeze(axis=1)  # Restore to a one-dimensional array
Pass in the optimal model and perform SHAP interpretation
import shap

shap.initjs()

# Tree-based models can use the fast TreeExplainer; the others need the
# model-agnostic KernelExplainer with a summarized background set.
tree_models = {
    'GBDT': (GradientBoostingRegressor, space_GBDT),
    'RF': (RandomForestRegressor, space_RF),
    'LIGHTGBM': (LGBMRegressor, space_LIGHTGBM),
    'XGBOOST': (XGBRegressor, space_XGBOOST),
}
kernel_models = {
    'MLP': (MLPRegressor, space_MLP),
    'SVR': (SVR, space_SVR),
    'ADABOOST': (AdaBoostRegressor, space_ADABOOST),  # AdaBoost is not supported by TreeExplainer, so use KernelExplainer
}

if best_model_last in tree_models:
    estimator, best_space = tree_models[best_model_last]
    _, _, best_params = train(estimator(), best_space, True)
    model = estimator()
    model.set_params(**best_params)  # Use the best parameters determined above
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # Dependence plot for the GDP feature
elif best_model_last in kernel_models:
    estimator, best_space = kernel_models[best_model_last]
    _, _, best_params = train(estimator(), best_space, True)
    model = estimator()
    model.set_params(**best_params)  # Use the best parameters determined above
    model.fit(X_train_SHAP, y_train_SHAP)
    X_train_summary = shap.kmeans(X_train_SHAP, 10)  # Summarize the background data to keep KernelExplainer tractable
    explainer = shap.KernelExplainer(model.predict, X_train_summary)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # Dependence plot for the GDP feature
shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
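force_plot renders with JavaScript, so it only displays inside a notebook after shap.initjs(). To keep the interactive plot outside the notebook, it can be written to a standalone HTML file; a small sketch using shap's save_html (the file name is arbitrary):

# Save the interactive force plot to a standalone HTML file
shap.save_html('force_plot.html',
               shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP))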
Tip: The code is missing the train function. Please contact me to purchase the executable code.