Python case analysis: Classification prediction using LightGBM algorithm, random forest, and five-fold cross-validation

1. Data import

import pandas as pd
import numpy as np
import warnings
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
# Silence library warnings so they do not clutter the printed output.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is printed.
pd.options.display.max_columns = None
# Print floats with two decimals instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Load the training file and the test file (in that order) from the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('Training.csv', 'Test.csv'))

2. Data cleaning

# Merge train and test so that the cleaning steps are applied to both.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it the row labels of the two files repeat, which makes the label-aligned
# ``.loc`` assignments performed later in the script ambiguous.
total_data = pd.concat([train_data, test_data], ignore_index=True)

# Cells holding exactly '.' or '?' are treated as missing and become NaN.
total_data = total_data.replace(['.', '?'], np.nan)

print(total_data[['area']].info())

According to the data dictionary, we can learn the following:
1) area is a character string: the first digit of the area code is the major classification, the first two digits are the middle classification, and so on. In theory it should therefore have 3 digits, so we need to convert abnormal values into null values and extract the first three digits respectively;
2) The fields highlighted in yellow are related to one another, e.g. ck-saveall = ck-savetime × ck-saveavg, so missing values can be filled in by calculation;
3) The ck field is related to the fields after it whose names contain ck. In theory, as long as any of those fields has a value > 0, ck is 1;
4) Other fields probably have internal relationships as well, but because I am not familiar with the banking industry, I did not dare to touch them;
5) Replacing missing values with the mean is not recommended. Because I will use the LightGBM algorithm later, the missing values can be left unfilled; if RandomForest is used, it is recommended to fill the missing values with -1.

# Keep only area codes with at least three characters; shorter codes become NaN.
# The result is assigned back to the column because an in-place ``where`` on
# ``total_data['area']`` operates on a temporary Series and is not guaranteed
# to update ``total_data`` itself.
total_data['area'] = total_data['area'].where(total_data['area'].str.len() >= 3)
print(total_data[['area']].info())

# Convert some object variables to numerical variables
# Two columns lack the hyphen used by the other 'dep-*' columns; rename them
# so that the '{way}avg' lookups later in the script find them.
total_data.rename(columns={'depsaveavg': 'dep-saveavg', 'depdrawavg': 'dep-drawavg'}, inplace=True)
# Every column except the identifier, the categorical fields and the target
# is numeric.  Walking the columns in frame order (instead of taking a set
# difference) keeps the feature order identical from run to run.
non_numeric_columns = {'ID', 'area', 'ck', 'comp', 'VV'}
num_features = [name for name in total_data.columns if name not in non_numeric_columns]
for col in num_features:
    total_data[col] = pd.to_numeric(total_data[col])
total_data.info()

def cap(x, quantile=(0.05, 0.95)):
    """Cap the outliers of a numeric column at two quantiles.

    Args:
        x: pd.Series column, continuous variable.
        quantile: Pair ``(lower, upper)`` of quantile levels. Values below
            the lower quantile are raised to it and values above the upper
            quantile are lowered to it. Any two-element sequence works.

    Returns:
        A new Series with the capped values. Missing values stay missing
        and the input Series is not modified.
    """
    # Quantiles are computed on the non-missing values of the column.
    lower, upper = x.quantile(list(quantile)).tolist()
    # clip() leaves values inside [lower, upper] untouched and ignores a
    # bound that is NaN (e.g. for a column without any valid value).
    return x.clip(lower=lower, upper=upper)

# Remember the current column order so it can be restored after the concat.
columns = list(total_data.columns)
# Cap every numeric feature of the combined (train + test) frame, then glue
# the untouched identifier / categorical / target columns back on.
untouched_columns = ['ID', 'area', 'ck', 'comp', 'VV']
total_data_1 = total_data.loc[:, num_features]
total_data_2 = total_data_1.apply(cap)
new_total_data = pd.concat([total_data.loc[:, untouched_columns], total_data_2], axis=1)
total_data = new_total_data.loc[:, columns]
total_data.head()

# Fill gaps in the 'ck' and 'dep' amount fields using all = time * avg.
for way in ['ck-save', 'ck-draw', 'dep-save', 'dep-draw']:
    all_col = '{}all'.format(way)
    time_col = '{}time'.format(way)
    avg_col = '{}avg'.format(way)

    # Derive each of the three quantities from the other two.  All helper
    # columns are computed before anything is filled in.
    total_data['new_' + all_col] = total_data[time_col] * total_data[avg_col]
    total_data['new_' + time_col] = total_data[all_col] / total_data[avg_col]
    total_data['new_' + avg_col] = total_data[all_col] / total_data[time_col]

    # Only rows where the original value is missing receive the derived one.
    for target in (all_col, time_col, avg_col):
        missing = total_data[target].isnull()
        total_data.loc[missing, target] = total_data.loc[missing, 'new_' + target]
print(total_data.info())

# Supplement the data of ck
print(total_data['ck'].value_counts())

# A positive value in any of these detail fields sets ck to '1'
# (written as a string, exactly like the value assigned before).
ck_detail_columns = [
    'ck-saveall', 'ck-drawall', 'ck-drawtime', 'ck-saveavg', 'ck-drawavg',
    'ck-savetime', 'ck-changame', 'ck-changtime', 'ck-avg',
]
# NaN > 0 is False, so rows without any valid detail value are left alone.
has_ck_activity = (total_data[ck_detail_columns] > 0).any(axis=1)
total_data.loc[has_ck_activity, 'ck'] = '1'
print(total_data['ck'].value_counts())

3. Data integration and upsampling

Explanation: The results are scored by the F-measure on the small and medium-sized enterprises with capital needs, and the samples are extremely imbalanced, so the data distribution has to be adjusted through upsampling. I used the SMOTE method here (during the exam there was not enough time, so I simply replicated the samples labelled 1 forty times).

# Keep only the columns needed for modelling.
cate_features = ['area', 'ck', 'comp']
predictors = num_features + cate_features
all_columns = predictors + ['ID', 'VV']
total_data = total_data[all_columns]
# Remaining missing values are encoded as -1.
total_data = total_data.fillna(-1)

# Make sure the categorical fields are numeric as well.
for col in cate_features:
    total_data[col] = pd.to_numeric(total_data[col])

# Rows whose target is the literal 'Withheld' form the test set.  ``.copy()``
# makes both parts independent frames, so columns assigned to them later do
# not act on a slice of ``total_data``.
new_train_data = total_data[total_data['VV'] != 'Withheld'].copy()
new_test_data = total_data[total_data['VV'] == 'Withheld'].copy()

# Upsampling with SMOTE
smo = SMOTE(random_state=42)
# Work on an independent frame so that the integer cast below is not a
# chained assignment on a slice of ``total_data``.
new_train_data = new_train_data.copy()
new_train_data['VV'] = new_train_data['VV'].astype(int)
# ``fit_resample`` is the resampling method of imbalanced-learn samplers; the
# older ``fit_sample`` alias is not available in current releases.
X_smo, y_smo = smo.fit_resample(new_train_data[predictors], new_train_data['VV'])
# Put the resampled features and target back into a single frame.
last_train_data = pd.concat([X_smo, y_smo], axis=1)
last_train_data.info()

last_train_data.head()

4. Modeling prediction

(1) LightGBM

#simple prediction
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import lightgbm as lgb

# LightGBM parameters for the single train/validation run.
params = {
    'num_leaves': 30,  # strong impact on the result: larger fits better, too large overfits
    'min_data_in_leaf': 30,
    'objective': 'binary',  # objective function
    'max_depth': -1,
    'learning_rate': 0.01,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.8,  # fraction of features sampled
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,  # L1 regularisation
    # 'lambda_l2': 0.001,  # L2 regularisation (disabled)
    "verbosity": -1,
    "nthread": -1,  # number of threads, -1 means all threads
    'metric': {'binary_logloss'},  # evaluation metric
    "random_state": 2020,  # random seed, keeps results consistent between runs
    # 'device': 'gpu',  # only with the GPU build of lightgbm
}

# Hold out 20% of the resampled training data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    last_train_data[predictors], last_train_data["VV"],
    test_size=0.2, random_state=2020)
training_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=training_data)

evals_result = {}  # used to record training results

# NOTE(review): early_stopping_rounds / evals_result / verbose_eval are keyword
# arguments of older LightGBM releases; newer releases expect the equivalent
# callbacks instead -- confirm against the installed version.
model = lgb.train(params,
                  training_data,
                  num_boost_round=10000,
                  valid_sets=val_data,
                  early_stopping_rounds=100,
                  categorical_feature=cate_features,
                  evals_result=evals_result,
                  verbose_eval=500)

# Validation report: probabilities are turned into labels at 0.5.
val_pred = model.predict(X_val)
val_pred = np.where(val_pred >= 0.5, 1, 0)
# ``Series.as_matrix()`` is not available in current pandas; ``.values``
# yields the same NumPy array.
val_true = y_val.values
print(classification_report(val_true, val_pred))

# Predict the test rows and write ID + predicted label to disk.
test_pred = model.predict(new_test_data[predictors])
test_pred = np.where(test_pred >= 0.5, 1, 0)
print(sum(test_pred))
print(len(test_pred))
answer = new_test_data.copy()
answer['VV'] = test_pred
answer[['ID', 'VV']].to_csv('results.csv', index=False)

(2) Random Forest

#Using RF for simple prediction
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Stratified 80/20 split of the training rows (not the SMOTE-resampled frame).
X_train, X_val, y_train, y_val = train_test_split(
    new_train_data[predictors], new_train_data["VV"],
    test_size=0.2, random_state=2020, stratify=new_train_data["VV"])

# A fixed random_state makes the forest reproducible, like the seeded split.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=5, min_samples_leaf=3,
                            random_state=2020)
rf.fit(X_train, y_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_val, rf.predict(X_val)))

pred = rf.predict(new_test_data[predictors])
# ``assign`` rebinds the name to a new frame carrying the predictions instead
# of writing a column into what may be a slice of ``total_data``.
new_test_data = new_test_data.assign(VV=pred)
new_test_data[['ID', 'VV']].to_csv('up_answer.csv', index=False)

print(sum(pred))
print(len(pred))

(3) LightGBM 5-fold cross-validation

#5-fold cross-validation
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# Features and target of the resampled training data, features of the test rows.
train_x, train_y = last_train_data[predictors], last_train_data['VV']
test_x = new_test_data[predictors]

# NumPy views of the three frames for use in the cross-validation code.
X = train_x.values
y = train_y.values
X_test = test_x.values

def self_metric(labels, preds):
    """Custom LightGBM evaluation function that reports the F1 score.

    LightGBM calls a ``feval`` function positionally as
    ``feval(predictions, eval_dataset)``.  The parameter names are kept
    unchanged for backward compatibility, so note what each one receives.

    Args:
        labels: Array of model outputs for the evaluated rows (first
            positional argument passed by LightGBM).
        preds: The ``lgb.Dataset`` being evaluated; its true labels are read
            with ``get_label()``.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)`` as required by
        LightGBM for custom metrics.
    """
    y_true = preds.get_label()
    # Turn the scores into hard labels with the 0.5 threshold used elsewhere
    # in this script; F1 is only defined for class labels.
    y_pred = np.where(labels >= 0.5, 1, 0)
    return 'f1', f1_score(y_true, y_pred), True

# LightGBM parameters for the 5-fold cross-validation run.
param = {
    'num_leaves': 30,  # strong impact on the result: larger fits better, too large overfits
    'min_data_in_leaf': 30,
    'objective': 'binary',  # objective function
    'max_depth': -1,
    'learning_rate': 0.01,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.8,  # fraction of features sampled
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,  # L1 regularisation
    # 'lambda_l2': 0.001,  # L2 regularisation (disabled)
    "verbosity": -1,
    "nthread": -1,  # number of threads, -1 means all threads
    'metric': {'binary_logloss'},  # evaluation metric
    "random_state": 2020,  # random seed, keeps results consistent between runs
    # 'device': 'gpu',  # only with the GPU build of lightgbm
}

# 5-fold cross-validation
folds = KFold(n_splits=5, shuffle=True, random_state=36)

predictions = []  # one array of test-set probabilities per fold

for fold_, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("{} cross-validation:".format(fold_ + 1))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    training_data = lgb.Dataset(X_train, label=y_train)  # training data
    validation_data = lgb.Dataset(X_valid, label=y_valid)  # validation data
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer releases expect the equivalent
    # callbacks instead -- confirm against the installed version.
    clf = lgb.train(param,
                    training_data,
                    num_boost_round=10000,
                    valid_sets=[validation_data],
                    verbose_eval=1000,
                    early_stopping_rounds=100,
                    )
    # Score this fold on its held-out part, using the same >= 0.5 decision
    # rule as the other thresholding steps in this script.
    x_pred = clf.predict(X_valid, num_iteration=clf.best_iteration)
    x_pred = np.where(x_pred >= 0.5, 1, 0)
    print(f1_score(y_valid, x_pred))
    # Keep this fold's probabilities for the test rows.
    y_test = clf.predict(X_test, num_iteration=clf.best_iteration)
    predictions.append(y_test)

# Average the per-fold probabilities for every test row.  Averaging along
# axis 0 covers however many test rows and folds there are, instead of
# assuming exactly 6537 rows and 5 folds.
pred1 = np.mean(predictions, axis=0)
final_scoreList = pred1.tolist()  # same averaged scores as a plain list

# Final label: 1 when the averaged probability reaches 0.5.
pred = np.where(pred1 >= 0.5, 1, 0)
print(sum(pred))
print(len(pred))

(4) LGBMClassifier 5-fold cross-validation

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import lightgbm as lgb
from scipy import stats

# Convert features AND target to NumPy arrays, so that the integer arrays
# produced by KFold select rows by position rather than by index label.
X = last_train_data[predictors].values
y = last_train_data['VV'].values
X_test = new_test_data[predictors].values

folds = KFold(n_splits=5, shuffle=True, random_state=36)
predictions = []  # predicted test labels, one array per fold

for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("{}th cross-validation:".format(k + 1))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    clg = LGBMClassifier(
        boosting="gbdt",
        learning_rate=0.1,
        colsample_bytree=0.8,
        # max_depth=5,
        # n_estimators=100,
        num_leaves=31,
        lambda_l1=0.1,
        lambda_l2=0.1,
        seed=0
    )
    # NOTE(review): the ``verbose`` argument of ``fit`` belongs to older
    # LightGBM releases -- confirm against the installed version.
    clg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=-1)
    train_pred = clg.predict(X_train)
    valid_pred = clg.predict(X_valid)
    print("Score of this round of training set: %.2f%%"%(f1_score(y_train,train_pred)*100))
    print("Validation set score of this round: %.2f%%"%(f1_score(y_valid,valid_pred)*100))
    # Keep this fold's predicted labels for the test rows.
    pred = clg.predict(X_test)
    predictions.append(pred)
    
# Majority vote across the folds for every test row.  Depending on the SciPy
# version the mode array comes back with or without a leading axis of length
# one, so it is flattened instead of being indexed with [0][0].
vote = stats.mode(np.asarray(predictions), axis=0)
last_pred = np.asarray(vote[0]).ravel()
# ``assign`` rebinds the name to a new frame carrying the predictions instead
# of writing a column into what may be a slice of ``total_data``.
new_test_data = new_test_data.assign(VV=last_pred)
new_test_data[['ID', 'VV']].to_csv('answer.csv', index=False)

print(sum(last_pred))
print(len(last_pred))