This section uses the same data as the earlier example, and the background is the same as described above.
Model construction is added here: logistic regression is used to build the model, and LightGBM is used for feature screening.
For an introduction to the LightGBM model, please see this link: Ensemble Learning – Boosting Algorithms: Brief Principles and Differences of AdaBoost, GBDT, XGBoost and LightGBM
The specific code is as follows:
import module
# import module
import math

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
read data
# Load the raw sample from a comma-separated text file.
df = pd.read_csv('Bcard.txt')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly rather than wrapped in print() (which would
# additionally print the literal "None").
df.info()
df.head()
Split the data into a training set and a validation set
# Hold out the 2018-11-30 observation month as the validation set and use
# every other month for training, sorted from the latest month backwards.
is_holdout = df.obs_mth == '2018-11-30'
train = df[~is_holdout].reset_index().sort_values('obs_mth', ascending=False)
val = df[is_holdout].reset_index()
train.head()
Group the data in the training set
# Number the training rows 0..n-1 in their current order, then cut that
# positional counter into 5 equal-width bins labelled 0-4.
# (Rows are presumably already ordered by observation month at this point.)
train['rank'] = list(range(train.shape[0]))
train['rank'] = pd.cut(train['rank'], bins=5, labels=list(range(5)))
train['rank'].value_counts()
get feature
# Candidate features are all training columns except the bookkeeping,
# grouping and target columns listed here.
non_feature_cols = ['index', 'rank', 'bad_ind', 'obs_mth', 'uid']
ft_lst = train.columns.drop(non_feature_cols)
ft_lst
define model function
# Define the lgb model function first
def lgb_test(train_X, train_y, test_X, test_y,
             learning_rate=0.05, n_estimators=100, early_stopping_rounds=100):
    """Fit a LightGBM classifier and return it with its held-out AUC.

    The model is evaluated on both the training data and the held-out data
    after every boosting round, and the AUC curves are plotted once fitting
    finishes.

    Args:
        train_X: Feature matrix used for fitting.
        train_y: Binary target aligned with ``train_X``.
        test_X: Held-out feature matrix used only for evaluation.
        test_y: Binary target aligned with ``test_X``.
        learning_rate: Boosting learning rate passed to ``LGBMClassifier``.
        n_estimators: Maximum number of boosting rounds.
        early_stopping_rounds: Patience, in rounds, of the early-stopping
            callback.

    Returns:
        A ``(model, auc)`` tuple: the fitted ``LGBMClassifier`` and the AUC
        recorded for the second evaluation set (``'valid_1'``) at the best
        iteration.
    """
    lgb_clf = lgb.LGBMClassifier(learning_rate=learning_rate,
                                 n_estimators=n_estimators)
    lgb_clf.fit(
        train_X, train_y,
        eval_set=[(train_X, train_y), (test_X, test_y)],
        eval_metric='auc',
        # The ``early_stopping_rounds`` keyword of fit() no longer exists in
        # LightGBM 4.x; the callback form is accepted by old and new releases.
        # NOTE: with the defaults the patience equals the number of rounds,
        # so early stopping cannot actually fire; lower
        # ``early_stopping_rounds`` or raise ``n_estimators`` to use it.
        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
    )
    lgb.plot_metric(lgb_clf, metric='auc')
    return lgb_clf, lgb_clf.best_score_['valid_1']['auc']
Perform feature cross-filtering
# Use LightGBM for cross-fold feature screening: every rank group is held out
# once while a model is fitted on the remaining groups.
feature_lst = []    # one importance table per fold, indexed by feature name
ks_train_lst = []   # KS on the fitting groups, one value per fold
ks_test_lst = []    # KS on the held-out group, one value per fold

# Sort the group labels so the per-fold results are appended in a fixed order.
for rk in sorted(set(train['rank'])):
    in_fold = train['rank'] == rk
    test_df = train[in_fold]
    train_df = train[~in_fold]

    train_X = train_df[ft_lst]
    train_y = train_df.bad_ind
    test_X = test_df[ft_lst]
    test_y = test_df.bad_ind

    # Only the fitted model is needed here; the returned AUC is not used.
    model, _ = lgb_test(train_X, train_y, test_X, test_y)

    feature = pd.DataFrame({
        'name': model.booster_.feature_name(),
        'importance': model.feature_importances_,
    }).set_index('name')
    feature_lst.append(feature)

    pred_y_train = model.predict_proba(train_X)[:, 1]
    pred_y_test = model.predict_proba(test_X)[:, 1]
    train_fpr, train_tpr, _ = roc_curve(train_y, pred_y_train)
    test_fpr, test_tpr, _ = roc_curve(test_y, pred_y_test)

    # KS is the largest absolute gap between the FPR and TPR curves.
    train_ks = abs(train_fpr - train_tpr).max()
    test_ks = abs(test_fpr - test_tpr).max()
    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)
calculate ks
# Report the KS averaged over the folds, first for the fitting groups and
# then for the held-out groups.
for ks_label, ks_values in (('train_ks', ks_train_lst), ('test_ks', ks_test_lst)):
    print(ks_label, np.mean(ks_values))
Perform feature screening
# Put the per-fold importance tables side by side, average each feature's
# importance across the folds, and keep the features whose mean exceeds 20.
importance_by_fold = pd.concat(feature_lst, axis=1)
feature_importance = importance_by_fold.mean(axis=1)
lst = feature_importance[feature_importance > 20].index.to_list()
lst
Use the model to build a scorecard
# Fit the final LightGBM model on the screened features and evaluate it on
# the training data and on the held-out validation month.
X, y = train[lst], train.bad_ind
evl_X, evl_y = val[lst], val.bad_ind

model, auc = lgb_test(X, y, evl_X, evl_y)

# Training data: ROC curve, KS (largest |FPR - TPR| gap) and AUC.
y_pred = model.predict_proba(X)[:, 1]
train_fpr, train_tpr, _ = roc_curve(y, y_pred)
train_ks = np.abs(train_fpr - train_tpr).max()
train_auc = metrics.auc(train_fpr, train_tpr)
print('train_ks', train_ks)

# Validation data: the same three measures.
y_pred = model.predict_proba(evl_X)[:, 1]
test_fpr, test_tpr, _ = roc_curve(evl_y, y_pred)
test_ks = np.abs(test_fpr - test_tpr).max()
test_auc = metrics.auc(test_fpr, test_tpr)
print('test_ks', test_ks)
draw roc curve
# Plot the training and validation ROC curves against the chance diagonal.
roc_fig, roc_ax = plt.subplots(figsize=(16, 10))
roc_ax.plot(train_fpr, train_tpr, color='blue',
            label='train lgb auc=%0.3f' % train_auc)
roc_ax.plot(test_fpr, test_tpr, color='orange',
            label='test lgb auc=%0.3f' % test_auc)
roc_ax.plot([0, 1], [0, 1], '--', color='black')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
roc_ax.set_xlim(0, 1)
roc_ax.set_ylim(0, 1)
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc=1)
plt.show()
Apply the scorecard formula to compute scores, then assign rating levels
# Since LightGBM has no regression coefficients, the score is derived
# directly from the predicted probability with the scorecard formula.
def score(p):
    """Convert a predicted bad probability into a scorecard score.

    The score is ``550 + 50 * log2((1 - p) / p)``: a probability of 0.5 maps
    to 550 and every doubling of the good-to-bad odds adds 50 points.

    Args:
        p: Predicted probability of the bad class.

    Returns:
        The score as a float. Probabilities of exactly 0 or 1 are clamped
        slightly inside the open interval first, because the raw formula
        would otherwise divide by zero or take the logarithm of zero.
    """
    eps = 1e-12
    p = min(max(p, eps), 1 - eps)
    return 550 + 50 * math.log2((1 - p) / p)


val['p'] = model.predict_proba(evl_X)[:, 1]
# Apply column-wise: same values as a row-wise DataFrame.apply, without
# building a Series for every row.
val['score'] = val['p'].apply(score)
print(classification_report(evl_y, model.predict(evl_X), target_names=['good', 'bad']))


# sort by rating
def level(score):
    """Map a score to a rating letter.

    Args:
        score: Scorecard score.

    Returns:
        ``"D"`` for scores up to 600, ``"C"`` up to 640, ``"B"`` up to 680,
        ``"A"`` above 680, and an empty string when no comparison holds
        (for example a NaN score).
    """
    if score <= 600:
        return "D"
    if score <= 640:
        return "C"
    if score <= 680:
        return "B"
    if score > 680:
        return "A"
    return ''


val['level'] = val['score'].apply(level)
val.level.value_counts()
verify ks
# Re-compute KS from the score column instead of the raw probability.
# The absolute value makes the result independent of which side of the
# diagonal the ROC curve falls on.
fpr, tpr, _ = roc_curve(evl_y, val['score'])
ks = np.abs(fpr - tpr).max()
print(ks)
0.42314255674351975
# Build the binned report: order the validation rows from the highest
# predicted bad probability down and cut them into 20 positional bins.
temp = pd.DataFrame({'bad_rate_pred': val['p']})
temp['real_bad'] = evl_y
temp = temp.sort_values('bad_rate_pred', ascending=False)
temp['num'] = list(range(temp.shape[0]))
temp['num'] = pd.cut(temp.num, bins=20, labels=list(range(20)))

# Per-bin bad/good counts and their running totals.
bad_flag_by_bin = temp.groupby('num').real_bad
report = pd.DataFrame()
report['bad'] = bad_flag_by_bin.sum()
report['good'] = bad_flag_by_bin.count() - report['bad']
report['bad_cnt'] = report['bad'].cumsum()
report['good_cnt'] = report['good'].cumsum()

# Cumulative shares of all bads / all goods captured up to each bin.
good_total = report['good_cnt'].max()
bad_total = report['bad_cnt'].max()
report['bad_pct'] = (report['bad_cnt'] / bad_total).round(3)
report['good_pct'] = (report['good_cnt'] / good_total).round(3)

# Share of bads inside each individual bin.
report['bad_rate'] = report.apply(
    lambda row: round(row.bad / (row.good + row.bad), 3), axis=1)


def cal_ks(x):
    """Return |bad_pct - good_pct| of one report row, rounded to 3 places."""
    gap = math.fabs(x.bad_pct - x.good_pct)
    return round(gap, 3)


report['ks'] = report.apply(cal_ks, axis=1)
report
Draw bad_rate and ks line chart
# Line chart with two y-axes: per-bin bad_rate on the left axis and the
# per-bin KS on the right axis, both over the 20 report bins.
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111)
ax2 = ax.twinx()

bin_positions = range(20)
ax.plot(bin_positions, report['bad_rate'], '-o', label='bad_rate')
ax2.plot(bin_positions, report['ks'], '--o', color='r', label='ks')

ax.grid()
ax.set(xlim=(-1, 20), ylim=(0, 0.14), xlabel='num', ylabel='bad_rate')
ax2.set(ylim=(0, 0.5), ylabel='ks')

ax.legend(loc=2)
ax2.legend(loc=0)