LightGBM example – feature screening and scorecard model construction

This example uses the same data as the previous one, and the background is the same as above.
Model construction: LightGBM is used for feature screening, and the scorecard model is then built on the selected features (the previous example built the model with logistic regression).
For an introduction to the LightGBM model, see this link: Ensemble Learning – Boosting Algorithms: Principles and Differences of AdaBoost, GBDT, XGBoost and LightGBM
The specific code is as follows:
Import modules

# import module
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math

Read the data

df = pd.read_csv('Bcard.txt')
df.info()
df.head()


Divide the training set and validation set

# Use the 2018-11-30 cohort as the out-of-time validation set; the rest is the training set
train = df[df.obs_mth != '2018-11-30'].reset_index().sort_values('obs_mth', ascending=False)
val = df[df.obs_mth == '2018-11-30'].reset_index()
train.head()


Group the data in the training set

# Split into 5 equal-sized groups in chronological order (used for grouped cross-validation below)
train['rank'] = list(range(train.shape[0]))
train['rank'] = pd.cut(train['rank'], bins=5, labels=list(range(5)))
train['rank'].value_counts()


Get the feature list

ft_lst = train.columns
ft_lst = ft_lst.drop(['index', 'rank', 'bad_ind', 'obs_mth', 'uid'])
ft_lst


Define the model function

# Define the lgb model function first
def lgb_test(train_X, train_y, test_X, test_y):
    lgb_clf = lgb.LGBMClassifier(learning_rate=0.05, n_estimators=100)
    lgb_clf.fit(train_X, train_y,
                eval_set=[(train_X, train_y), (test_X, test_y)],
                eval_metric='auc',
                early_stopping_rounds=100)
    lgb.plot_metric(lgb_clf, metric='auc')  # plot the AUC learning curves
    # print(lgb_clf.n_features_)
    return lgb_clf, lgb_clf.best_score_['valid_1']['auc']
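
Note: the early_stopping_rounds argument to fit() works in lightgbm 3.x but was removed in 4.x, where early stopping is configured through a callback instead. A minimal sketch of the equivalent call, assuming lightgbm >= 4.0:

# Equivalent fit call for lightgbm >= 4.0, where early stopping
# is passed as a callback rather than a fit() argument
import lightgbm as lgb

def lgb_test_v4(train_X, train_y, test_X, test_y):
    lgb_clf = lgb.LGBMClassifier(learning_rate=0.05, n_estimators=100)
    lgb_clf.fit(train_X, train_y,
                eval_set=[(train_X, train_y), (test_X, test_y)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(stopping_rounds=100)])
    return lgb_clf, lgb_clf.best_score_['valid_1']['auc']

Also note that with n_estimators=100 and a patience of 100 rounds, early stopping can never actually trigger here; it only takes effect if n_estimators is raised above the patience.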

Perform grouped cross-validation for feature screening

# Use lightgbm for feature screening: train on 4 of the 5 time-ordered groups,
# validate on the held-out group, and rotate through all 5 groups

feature_lst = []
ks_train_lst = []
ks_test_lst = []

# Hold out one group at a time as the validation fold
for rk in set(train['rank']):
    test_df = train[train['rank']==rk]
    train_df = train[train['rank']!=rk]
    
    train_X = train_df[ft_lst]
    train_y = train_df.bad_ind
    
    test_X = test_df[ft_lst]
    test_y = test_df.bad_ind
    
    model, auc = lgb_test(train_X, train_y, test_X, test_y)

    feature = pd.DataFrame({
        'name': model.booster_.feature_name(),
        'importance': model.feature_importances_
    }).set_index('name')
    feature_lst.append(feature)
    pred_y_train = model.predict_proba(train_X)[:, 1]
    pred_y_test = model.predict_proba(test_X)[:, 1]

    train_fpr, train_tpr, _ = roc_curve(train_y, pred_y_train)
    test_fpr, test_tpr, _ = roc_curve(test_y, pred_y_test)
    
    train_ks = abs(train_fpr-train_tpr).max()
    test_ks = abs(test_fpr-test_tpr).max()
    
    train_auc = metrics.auc(train_fpr, train_tpr)
    test_auc = metrics.auc(test_fpr, test_tpr)
    
    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)

Calculate KS

print('train_ks', np.mean(ks_train_lst))
print('test_ks', np.mean(ks_test_lst))
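
For reference, KS is the maximum gap between the true positive rate and the false positive rate over all score thresholds, which equals the largest separation between the cumulative score distributions of the bad and good samples. A self-contained sketch on synthetic data showing the two views agree:

# KS via roc_curve vs. KS as the maximum gap between the score
# distributions of bad (y=1) and good (y=0) samples -- the two agree
import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=1000)                    # synthetic labels
p = np.clip(0.3 * y + 0.7 * rng.random(1000), 0, 1)  # scores correlated with y

fpr, tpr, _ = roc_curve(y, p)
print('ks via roc_curve:', np.abs(tpr - fpr).max())

thresholds = np.unique(p)
cdf_bad = np.array([(p[y == 1] >= t).mean() for t in thresholds])   # TPR per threshold
cdf_good = np.array([(p[y == 0] >= t).mean() for t in thresholds])  # FPR per threshold
print('ks via CDF gap:  ', np.abs(cdf_bad - cdf_good).max())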


Perform feature screening

# Average the feature importances over the 5 folds and keep features with mean importance greater than 20
feature_importance = pd.concat(feature_lst, axis=1).mean(axis=1)
lst = feature_importance[feature_importance > 20].index.to_list()
lst
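
The cutoff of 20 is a judgment call rather than a fixed rule; one way to sanity-check it is to plot the averaged importances against the threshold. A minimal sketch, reusing the feature_importance Series built above:

# Visual check of the importance cutoff (uses the feature_importance
# Series from the previous step)
feature_importance.sort_values().plot.barh(figsize=(8, 6))
plt.axvline(20, color='r', linestyle='--')  # the screening threshold
plt.xlabel('mean importance over the 5 folds')
plt.show()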


Use the model to build a scorecard

# Use lightgbm to build a scorecard
X = train[lst]
y = train.bad_ind

evl_X = val[lst]
evl_y = val.bad_ind

# Train the final model, then evaluate on the training set
model, auc = lgb_test(X, y, evl_X, evl_y)
y_pred = model.predict_proba(X)[:, 1]
train_fpr, train_tpr, _ = roc_curve(y, y_pred)
train_ks = abs(train_fpr - train_tpr).max()
train_auc = metrics.auc(train_fpr, train_tpr)
print('train_ks', train_ks)

# Evaluate on the out-of-time validation set
y_pred = model.predict_proba(evl_X)[:, 1]
test_fpr, test_tpr, _ = roc_curve(evl_y, y_pred)
test_ks = abs(test_fpr - test_tpr).max()
test_auc = metrics.auc(test_fpr, test_tpr)
print('test_ks', test_ks)


Draw the ROC curve

# Draw the ROC curve
plt.figure(figsize=(16,10))
plt.plot(train_fpr, train_tpr, color='blue', label='train lgb auc=%0.3f'%train_auc)
plt.plot(test_fpr, test_tpr, color='orange', label='test lgb auc=%0.3f'%test_auc)
plt.plot([0,1],[0,1],'--', color='black')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,1)
plt.ylim(0,1)
plt.title('ROC Curve')
plt.legend(loc=1)
plt.show()


Score with the scorecard formula and assign grades

# lightgbm has no regression coefficients, so we apply the standard scorecard
# formula directly to the predicted probability of being bad
def score(p):
    return 550 + 50 * math.log2((1 - p) / p)
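
As a sanity check on the formula: it gives a base score of 550 at odds of 1:1 (p = 0.5) and adds 50 points every time the good:bad odds double, i.e. PDO = 50. A small worked example:

# Worked check of the scorecard formula: base score 550 at 1:1 odds,
# plus 50 points for each doubling of the good:bad odds (PDO = 50)
print(score(0.5))    # odds 1:1 -> 550
print(score(1 / 3))  # odds 2:1 -> 600
print(score(1 / 5))  # odds 4:1 -> 650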

val['p'] = model.predict_proba(evl_X)[:,1]
val['score'] = val.apply(lambda x: score(x.p), axis=1)
print(classification_report(evl_y, model.predict(evl_X), target_names=['good', 'bad']))

# Assign a grade based on the score
def level(score):
    if score <= 600:
        return 'D'
    elif score <= 640:
        return 'C'
    elif score <= 680:
        return 'B'
    else:
        return 'A'

val['level'] = val.apply(lambda x: level(x.score), axis=1)
val.level.value_counts()


Verify KS

# Verify KS on the validation set. The score is a strictly monotone (decreasing)
# transform of p, so it gives the same KS as the predicted probability itself.
fpr, tpr, _ = roc_curve(evl_y, val['score'])
ks = abs(fpr - tpr).max()
print(ks)

0.42314255674351975
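
If you want to convince yourself of that invariance, here is a quick self-contained check applying the same scorecard transform to synthetic data:

# KS is invariant under strictly monotone transforms of the ranking score
import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.default_rng(1)
y = rng.integers(0, 2, size=1000)
p = np.clip(0.3 * y + 0.7 * rng.random(1000), 1e-6, 1 - 1e-6)  # keep p in (0, 1)
s = 550 + 50 * np.log2((1 - p) / p)  # monotone decreasing in p

fpr_p, tpr_p, _ = roc_curve(y, p)
fpr_s, tpr_s, _ = roc_curve(y, s)
print(np.abs(tpr_p - fpr_p).max())  # KS from probabilities
print(np.abs(tpr_s - fpr_s).max())  # same KS from scores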

# Generate the report
temp = pd.DataFrame()
temp['bad_rate_pred'] = val['p']
temp['real_bad'] = evl_y
temp.sort_values('bad_rate_pred', inplace=True, ascending=False)
temp['num'] = list(range(temp.shape[0]))
temp['num'] = pd.cut(temp.num, bins=20, labels=list(range(20)))

report = pd.DataFrame()
report['bad'] = temp.groupby('num').real_bad.sum()
report['good'] = temp.groupby('num').real_bad.count()-report['bad']
report['bad_cnt'] = report['bad'].cumsum()
report['good_cnt'] = report['good'].cumsum()
good_total = report['good_cnt'].max()
bad_total = report['bad_cnt'].max()
report['bad_pct'] = round(report['bad_cnt']/bad_total,3)
report['good_pct'] = round(report['good_cnt']/good_total,3)
report['bad_rate'] = report.apply(lambda x:round(x.bad/(x.good + x.bad), 3), axis=1)
def cal_ks(x):
    ks = x.bad_pct - x.good_pct
    return round(math.fabs(ks),3)

report['ks'] = report.apply(cal_ks, axis=1)
report
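
The ks column above is computed on 20 equal-frequency bins, so its maximum is a binned approximation of the exact KS verified earlier; the two should be close but not identical. A one-line check against the report frame:

# Binned (20-group) KS from the report vs. the exact KS from roc_curve above
print(report['ks'].max())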


Draw the bad_rate and KS line chart

# Draw a line chart of bad_rate and KS
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111)
ax.plot(range(20), report['bad_rate'], '-o', label='bad_rate')
ax2 = ax.twinx()
ax2.plot(range(20), report['ks'], '--o', color='r', label='ks')
ax.grid()
ax.set_xlim(-1, 20)
ax.set_ylim(0, 0.14)
ax2.set_ylim(0, 0.5)
ax.set_ylabel('bad_rate')
ax2.set_ylabel('ks')
ax.set_xlabel('num')
ax.legend(loc=2)
ax2.legend(loc=0)
plt.show()