Article directory
- 1. Data set processing
- 2. Define the model
  - Training and drawing
- 3. Probability of good people/probability of bad people
- 4. Generate report
- 5. Behavioral scorecard model performance
- Summarize
1. Data set processing
import pandas as pd from sklearn.metrics import roc_auc_score,roc_curve,auc from sklearn.model_selection import train_test_split from sklearn import metrics from sklearn.linear_model import LogisticRegression import numpy as np import random import math import time import lightgbm as lgb
# Load the raw behaviour-scorecard sample and preview its first rows.
data = pd.read_csv('Bcard.txt')
data.head()

# Look at the observation months present in the data; the last month
# ('2018-11-30') is used below as the cross-time (out-of-time) validation set.
data.obs_mth.unique()

# Split on the observation month: rows from the held-out month go to `val`,
# all remaining rows go to `df_train`.
is_holdout_month = data['obs_mth'] == '2018-11-30'
df_train = data.loc[~is_holdout_month].reset_index().copy()
val = data.loc[is_holdout_month].reset_index().copy()

# All candidate variables. Names ending in "_info" are the personal-performance
# outputs of the in-house unsupervised system; names ending in "_score" are
# paid external credit data.
lst = [
    'person_info',
    'finance_info',
    'credit_info',
    'act_info',
    'td_score',
    'jxl_score',
    'mj_score',
    'rh_score',
]
# Order the training rows from the newest observation month to the oldest.
df_train = df_train.sort_values(by='obs_mth', ascending=False)

# Give every row its 1-based position in the sorted frame, then scale the
# position by the row count so it becomes a fraction in (0, 1].
n_rows = len(df_train)
rank_lst = list(range(1, n_rows + 1))
df_train['rank'] = rank_lst
df_train['rank'] = df_train['rank'] / n_rows

# Replace the fraction with a fold label 1..5. A fraction that is exactly equal
# to a boundary stays in the lower fold (<= 0.2 -> 1, <= 0.4 -> 2, ...), which
# is the same as counting how many boundaries the fraction strictly exceeds.
fold_upper_bounds = (0.2, 0.4, 0.6, 0.8)
pct_lst = [
    1 + sum(fraction > bound for bound in fold_upper_bounds)
    for fraction in df_train['rank']
]
df_train['rank'] = pct_lst

# Dropping 'obs_mth' is intentionally left disabled, so the column is kept.
# train = train.drop('obs_mth',axis = 1)
df_train.head()
1. Use the sort_values() function to sort df_train in descending order according to the ‘obs_mth’ column. This means that observations with newer months will be ranked first.
2. Create a list named rank_lst containing all integers from 1 to len(df_train). This is done to subsequently assign a rank to each row of the DataFrame.
3. Assign rank_lst to the ‘rank’ column of df_train. In this way, the ‘rank’ column of each row holds the 1-based position of that row in the sorted DataFrame.
4. Divide the ‘rank’ column of df_train by len(df_train) to convert the position into a fraction greater than 0 and at most 1. In this way, the ‘rank’ column of each row represents the relative position of the row in the sorted DataFrame.
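5. Create a new list pct_lst that holds one bucket label for each row, derived from the fractional rank computed in the previous step.
6. Go through the ‘rank’ column of df_train and map each fraction to a bucket label: values up to 0.2 become 1, up to 0.4 become 2, up to 0.6 become 3, up to 0.8 become 4, and everything above 0.8 becomes 5. Then write pct_lst back to the ‘rank’ column. After this step, the ‘rank’ column of each row is the number (1–5) of the fold that the row belongs to, and the folds are contiguous in observation time.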
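7. Finally, the ‘obs_mth’ column in df_train could be deleted, as it is no longer needed once the folds are assigned. Note that in the code above this drop is commented out, so the column is actually kept.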
# Count how many training rows carry each fold label stored in 'rank'.
df_train.groupby('rank')['rank'].count()
2. Define the model
def LGB_test(train_x, train_y, test_x, test_y):
    """Fit a LightGBM binary classifier and return it with a hold-out AUC.

    The classifier is fitted on ``train_x``/``train_y``. Both the training
    data and ``test_x``/``test_y`` are passed as evaluation sets with AUC as
    the evaluation metric. The number of features seen by the fitted model is
    printed as a side effect.

    Args:
        train_x: Feature matrix used for fitting.
        train_y: Binary target aligned with ``train_x``.
        test_x: Feature matrix of the hold-out set.
        test_y: Binary target aligned with ``test_x``.

    Returns:
        A ``(clf, auc)`` tuple: the fitted ``lgb.LGBMClassifier`` and the value
        stored at ``clf.best_score_['valid_1']['auc']`` (presumably the second
        entry of ``eval_set``, i.e. the hold-out data - confirm against
        LightGBM's evaluation-set naming).
    """
    from multiprocessing import cpu_count

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=31,
        reg_alpha=0.0,
        reg_lambda=1,
        max_depth=2,
        n_estimators=800,
        # NOTE(review): max_features looks like a scikit-learn carry-over;
        # confirm that LightGBM does anything with it.
        max_features=140,
        objective='binary',
        subsample=0.7,
        colsample_bytree=0.7,
        subsample_freq=1,
        learning_rate=0.05,
        min_child_weight=50,
        random_state=None,
        # Leave one core free, but never ask for fewer than one worker:
        # cpu_count() - 1 would be 0 on a single-core machine.
        n_jobs=max(1, cpu_count() - 1),
        # Number of iterations. NOTE(review): LightGBM documents this as an
        # alias of n_estimators; both are set to 800 here.
        num_iterations=800,
    )
    clf.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (test_x, test_y)],
        eval_metric='auc',
    )
    print(clf.n_features_)
    return clf, clf.best_score_['valid_1']['auc']


# Per-fold results: important feature names keyed by fold label, and the KS
# on the training part / held-out part of every fold.
feature_lst = {}
ks_train_lst = []
ks_test_lst = []

# Leave-one-fold-out: each fold label in 'rank' is used once as the test set
# while the rows of all other folds form the training set.
for rk in sorted(set(df_train['rank'])):
    # Define the model training set and test set for this fold.
    ttest = df_train[df_train['rank'] == rk]
    ttrain = df_train[df_train['rank'] != rk]

    train = ttrain[lst]
    train_y = ttrain['bad_ind']
    test = ttest[lst]
    test_y = ttest['bad_ind']

    # The returned AUC gets its own name so that it does not shadow the `auc`
    # function imported from sklearn.metrics at the top of the file.
    model, fold_auc = LGB_test(train, train_y, test, test_y)

    # Feature importances of this fold's model, most important first.
    feature = pd.DataFrame(
        {
            'name': model.booster_.feature_name(),
            'importance': model.feature_importances_,
        }
    ).sort_values(by=['importance'], ascending=False)

    # KS and AUC on the training part and on the held-out part of the fold.
    # KS is the largest gap between TPR and FPR along the ROC curve.
    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]

    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)

    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()

    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)

    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)

    # Keep the names of the features whose importance is at least 20.
    feature_lst[str(rk)] = feature[feature['importance'] >= 20]['name']

# Average KS over all folds.
train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)

# Features that passed the importance threshold in every fold. The fold keys
# are taken from feature_lst itself instead of being hard-coded as '1'..'5',
# so a different number of folds no longer raises KeyError.
ft_lst = {fold: feature_lst[fold] for fold in sorted(feature_lst)}
fold_feature_sets = [set(names) for names in ft_lst.values()]
fn_lst = list(set.intersection(*fold_feature_sets)) if fold_feature_sets else []

print('train_ks: ', train_ks)
print('test_ks: ', test_ks)
print('ft_lst: ', fn_lst)
Training and drawing
# Features used for the final model.
lst = ['person_info', 'finance_info', 'credit_info', 'act_info']

# Training rows are every observation month except '2018-11-30'; that month is
# held out as the evaluation set.
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()

x = train[lst]
y = train['bad_ind']

evl_x = evl[lst]
evl_y = evl['bad_ind']

# The returned AUC gets its own name so that it does not shadow the `auc`
# function imported from sklearn.metrics at the top of the file.
model, evl_auc = LGB_test(x, y, evl_x, evl_y)

# KS on the training data: largest gap between TPR and FPR on the ROC curve.
y_pred = model.predict_proba(x)[:, 1]
fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y, y_pred)
train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
print('train_ks : ', train_ks)

# KS on the held-out evaluation data.
y_pred = model.predict_proba(evl_x)[:, 1]
fpr_lgb, tpr_lgb, _ = roc_curve(evl_y, y_pred)
evl_ks = abs(fpr_lgb - tpr_lgb).max()
print('evl_ks : ', evl_ks)

# ROC curves of both data sets against the diagonal of a random classifier.
from matplotlib import pyplot as plt

# The curves come from the LightGBM model returned by LGB_test, so the legend
# says "LGB" rather than "LR".
plt.plot(fpr_lgb_train, tpr_lgb_train, label='train LGB')
plt.plot(fpr_lgb, tpr_lgb, label='evl LGB')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
3. Probability of good people/probability of bad people
# Model inputs at this point:
# ['person_info','finance_info','credit_info','act_info']


def score(xbeta):
    """Convert a predicted probability into a scorecard score.

    The score is ``1000 - 500 * log2((1 - p) / p)``, where ``(1 - p) / p`` is
    the ratio "probability of good people / probability of bad people" for a
    predicted bad probability ``p``. A probability of 0.5 therefore maps to
    1000, and every doubling of the good-to-bad odds lowers the score by 500.

    Args:
        xbeta: Predicted probability of the positive class, expected in
            [0, 1]. Values of exactly 0 or 1 are moved marginally inside the
            interval so that the odds stay finite.

    Returns:
        The score as a float; it increases with ``xbeta``.
    """
    eps = 1e-12
    # Clamp so that neither the division nor the logarithm can fail.
    p = min(max(xbeta, eps), 1 - eps)
    # Probability of good people / probability of bad people.
    odds_good_to_bad = (1 - p) / p
    return 1000 - 500 * math.log2(odds_good_to_bad)


# Predicted probability of class 1 (the `bad_ind` target) on the evaluation
# set, and the score derived from it.
evl['xbeta'] = model.predict_proba(evl_x)[:, 1]
evl['score'] = evl['xbeta'].map(score)
# KS of the converted score on the evaluation set: the largest absolute gap
# between the false-positive and true-positive rates along the ROC curve.
fpr_lr, tpr_lr, _ = roc_curve(evl_y, evl['score'])
ks_gap = np.abs(fpr_lr - tpr_lr)
evl_ks = ks_gap.max()
print('val_ks : ', evl_ks)
4. Generate report
row_num, col_num = 0, 0
bins = 20
Y_predict = evl['score']
Y = evl_y
nrows = Y.shape[0]

# Pair every score with its label by position (both series come from `evl`),
# so the pairing does not depend on the index labels being 0..nrows-1.
lis = list(zip(Y_predict, Y))
# Highest scores first; bins are cut from this ordering.
ks_lis = sorted(lis, key=lambda pair: pair[0], reverse=True)

# Rows per bin, rounded up. Because of the rounding, the last bins can be
# smaller than the others or, for small samples, empty.
bin_num = int(nrows / bins + 1)

# Overall number of bad (label > 0.5) and good (label <= 0.5) rows.
# NOTE: both classes must be present, otherwise the KS below divides by zero.
bad = sum(1 for _, label in ks_lis if label > 0.5)
good = sum(1 for _, label in ks_lis if label <= 0.5)

# Loop-invariant denominator of the cumulative bad share.
total_bad_labels = sum(evl_y)

bad_cnt, good_cnt = 0, 0
KS = []
BAD = []
GOOD = []
BAD_CNT = []
GOOD_CNT = []
BAD_PCTG = []
BADRATE = []
dct_report = {}

for j in range(bins):
    ds = ks_lis[j * bin_num: min((j + 1) * bin_num, nrows)]

    # Bad / good rows inside this bin.
    bad1 = sum(1 for _, label in ds if label > 0.5)
    good1 = sum(1 for _, label in ds if label <= 0.5)

    # Cumulative counts up to and including this bin.
    bad_cnt += bad1
    good_cnt += good1

    # Share of all bad rows captured so far.
    bad_pctg = round(bad_cnt / total_bad_labels, 3)
    # Bad rate inside the bin; an empty bin is reported as 0.0 instead of
    # raising ZeroDivisionError.
    badrate = round(bad1 / (bad1 + good1), 3) if ds else 0.0
    # Gap between the cumulative bad share and the cumulative good share.
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)), 3)

    KS.append(ks)
    BAD.append(bad1)
    GOOD.append(good1)
    BAD_CNT.append(bad_cnt)
    GOOD_CNT.append(good_cnt)
    BAD_PCTG.append(bad_pctg)
    BADRATE.append(badrate)

dct_report['KS'] = KS
dct_report['BAD'] = BAD
dct_report['GOOD'] = GOOD
dct_report['BAD_CNT'] = BAD_CNT
dct_report['GOOD_CNT'] = GOOD_CNT
dct_report['BAD_PCTG'] = BAD_PCTG
dct_report['BADRATE'] = BADRATE

# One row per bin, in descending score order.
val_repot = pd.DataFrame(dct_report)
val_repot
5. Behavioral scorecard model performance
from pyecharts.charts import *
from pyecharts import options as opts
# NOTE(review): `mpl` below is not imported explicitly anywhere in view;
# presumably it is provided by this star import - confirm before narrowing it.
from pylab import *

# Display settings: SimHei as the matplotlib sans-serif font, no scientific
# notation in NumPy output, and wide-character-aware column alignment in pandas.
mpl.rcParams['font.sans-serif'] = ['SimHei']
np.set_printoptions(suppress=True)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# Build the chart step by step; each call hands back the chart it was called on.
line = Line()

# First series: per-bin bad rate on the primary y axis (index 0).
line = line.add_xaxis(list(val_repot.index))
line = line.add_yaxis(
    "Proportion of bad guys by group",
    list(val_repot.BADRATE),
    yaxis_index=0,
    color="red",
)
line = line.set_global_opts(
    title_opts=opts.TitleOpts(title="Behavioral scorecard model performance"),
)

# Secondary y axis on the right-hand side, fixed to the range 0..0.5.
# NOTE(review): the axis is named "Cumulative proportion of bad guys", but the
# series drawn against it below (yaxis_index=1) is the KS column - confirm
# which one is intended.
right_axis_line = opts.AxisLineOpts(
    linestyle_opts=opts.LineStyleOpts(color="red")
)
right_axis = opts.AxisOpts(
    name="Cumulative proportion of bad guys",
    type_="value",
    min_=0,
    max_=0.5,
    position="right",
    axisline_opts=right_axis_line,
    axislabel_opts=opts.LabelOpts(formatter="{value}"),
)
line = line.extend_axis(yaxis=right_axis)

# Second series: KS per bin on the secondary y axis, without point labels.
line = line.add_xaxis(list(val_repot.index))
line = line.add_yaxis(
    "KS",
    list(val_repot['KS']),
    yaxis_index=1,
    color="blue",
    label_opts=opts.LabelOpts(is_show=False),
)

line.render_notebook()