LightGBM behavior scorecard (B-card)

Article directory

  • 1. Data set processing
  • 2. Define the model
    • training and drawing
  • 3. Probability of good people/probability of bad people
  • 4. Generate reports
  • 5. Performance of behavioral scorecard model
  • Summarize

1. Data set processing

import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import math
import time
import lightgbm as lgb
# Load the scorecard dataset; each row is one account-month observation.
data = pd.read_csv('Bcard.txt')
data.head()


#Look at the monthly distribution, we use the last month as the cross-time verification set
data.obs_mth.unique()

# Everything before the last month is training material; the last month
# ('2018-11-30') is held out as the cross-time (out-of-time) validation set.
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
#These are all our variables. The end of info is the personal performance output by the unsupervised system we made, and the end of score is the paid external credit data.
lst = ['person_info','finance_info','credit_info','act_info','td_score','jxl_score','mj_score','rh_score']

df_train = df_train.sort_values(by = 'obs_mth', ascending = False)

rank_lst = []
for i in range(1,len(df_train) + 1):
    rank_lst.append(i)
    
df_train['rank'] = rank_lst

df_train['rank'] = df_train['rank']/len(df_train)

pct_lst = []
for x in df_train['rank']:
    if x <= 0.2:
        x = 1
    elif x <= 0.4:
        x = 2
    elif x <= 0.6:
        x = 3
    elif x <= 0.8:
        x = 4
    else:
        x=5
    pct_lst.append(x)
df_train['rank'] = pct_lst
#train = train.drop('obs_mth',axis = 1)
df_train.head()

1. Use sort_values() to sort df_train by the ‘obs_mth’ column in descending order, so observations from newer months come first.
2. Build a list named rank_lst containing the integers from 1 to len(df_train), one rank per row of the DataFrame.
3. Assign rank_lst to the ‘rank’ column of df_train, so each row’s ‘rank’ is its position in the sorted DataFrame.
4. Divide the ‘rank’ column of df_train by len(df_train) to convert it into a percentile, so each row’s ‘rank’ is its rank percentile within the sorted DataFrame.
5. Build a new list pct_lst by mapping each percentile into one of five equal-width buckets (1 through 5, cut at 0.2/0.4/0.6/0.8).
6. Assign pct_lst back to the ‘rank’ column of df_train, so each row’s ‘rank’ is its quintile bucket number.
7. The line that would drop the ‘obs_mth’ column is left commented out in the code; once the buckets are assigned the month column is no longer needed.

# Sanity check: row count per quintile bucket (the five folds should be even).
df_train.groupby('rank')['rank'].count()


2. Define the model

#Define lgb function
def LGB_test(train_x,train_y,test_x,test_y):
    """Fit a binary LightGBM classifier and return it with its eval AUC.

    Parameters are the train/test feature frames and label series.
    Returns (fitted_model, best AUC observed on the eval set 'valid_1').
    """
    from multiprocessing import cpu_count
    # NOTE: the original passed max_features=140 (not a LightGBM parameter,
    # silently ignored) and num_iterations=800 (an alias duplicating
    # n_estimators=800); both are dropped with no behavior change.
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None,
        n_jobs=cpu_count()-1,  # NOTE(review): evaluates to 0 on a 1-core box — confirm
    )
    clf.fit(train_x, train_y,eval_set=[(train_x, train_y),(test_x,test_y)],eval_metric='auc')
    print(clf.n_features_)

    return clf,clf.best_score_['valid_1']['auc']
# Five-fold cross validation over the quintile buckets: each bucket takes one
# turn as the held-out fold; KS is collected per fold and per-fold feature
# importances are kept for a later intersection.
# FIX: the original text was mangled by HTML extraction — '{<!-- -->}' for '{}'
# and '& amp;' for '&' — which made it a syntax error.
feature_lst = {}
ks_train_lst = []
ks_test_lst = []
for rk in set(df_train['rank']):

    # The last month is reserved separately as the cross-time validation set.

    # Define the model training set and test set for this fold.
    ttest = df_train[df_train['rank'] == rk]
    ttrain = df_train[df_train['rank'] != rk]

    train = ttrain[lst]
    train_y = ttrain.bad_ind

    test = ttest[lst]
    test_y = ttest.bad_ind

    start = time.time()
    model,auc = LGB_test(train,train_y,test,test_y)
    end = time.time()

    # Model contribution (feature importances), most important first.
    feature = pd.DataFrame(
                {'name' : model.booster_.feature_name(),
                'importance' : model.feature_importances_
              }).sort_values(by = ['importance'], ascending = False)


    # Compute KS and AUC on this fold's training and test sets.

    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]


    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)


    # KS = max vertical gap between the TPR and FPR curves.
    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()


    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)

    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)

    # Keep only features with importance >= 20 for this fold.
    feature_lst[str(rk)] = feature[feature.importance>=20].name
train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)

ft_lst = {}
for i in range(1,6):
    ft_lst[str(i)] = feature_lst[str(i)]

# Features that pass the importance cut in every one of the five folds.
fn_lst=list(set(ft_lst['1']) & set(ft_lst['2'])
     & set(ft_lst['3']) & set(ft_lst['4']) & set(ft_lst['5']))

print('train_ks: ',train_ks)
print('test_ks: ',test_ks)

print('ft_lst: ',fn_lst )

Training and drawing

# Final feature list — presumably the stable features surviving the fold
# intersection (fn_lst) printed above; verify against that output.
lst = ['person_info','finance_info','credit_info','act_info']

# Refit on all months except the last; the last month ('2018-11-30') is the
# cross-time (out-of-time) evaluation set.
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()

x = train[lst]
y = train['bad_ind']

evl_x = evl[lst]
evl_y = evl['bad_ind']

model,auc = LGB_test(x,y,evl_x,evl_y)

# KS on the training set.
y_pred = model.predict_proba(x)[:,1]
fpr_lgb_train,tpr_lgb_train,_ = roc_curve(y,y_pred)
train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
print('train_ks : ',train_ks)

# KS on the cross-time evaluation set.
y_pred = model.predict_proba(evl_x)[:,1]
fpr_lgb,tpr_lgb,_ = roc_curve(evl_y,y_pred)
evl_ks = abs(fpr_lgb - tpr_lgb).max()
print('evl_ks : ',evl_ks)

# ROC curves: train vs. cross-time evaluation, with the chance diagonal.
from matplotlib import pyplot as plt
plt.plot(fpr_lgb_train,tpr_lgb_train,label = 'train LR')
plt.plot(fpr_lgb,tpr_lgb,label = 'evl LR')
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc = 'best')
plt.show()

3. Probability of good people/probability of bad people

#['person_info','finance_info','credit_info','act_info']
# Convert a predicted bad-probability into a scorecard score in one step.
def score(xbeta):
    """Return 1000 - 500 * log2(odds of good), odds = (1 - xbeta) / xbeta.

    Higher predicted bad-probability ``xbeta`` gives a lower score
    (the odds ratio is "probability of good / probability of bad",
    exactly as the original comment states).
    """
    # BUG FIX: the original computed math.log2(1-xbeta)/xbeta — i.e. it
    # divided the log by the probability instead of taking log2 of the
    # odds ratio (1-xbeta)/xbeta.
    score = 1000-500*(math.log2((1-xbeta)/xbeta)) #Probability of good people/Probability of bad people
    return score
# Score the cross-time validation set and recompute KS on the score column.
evl['xbeta'] = model.predict_proba(evl_x)[:,1]
evl['score'] = evl.apply(lambda x : score(x.xbeta) ,axis=1)
# NOTE(review): score is a monotone transform of xbeta, so the KS computed on
# the score matches the probability-based KS (up to curve direction) — confirm.
fpr_lr,tpr_lr,_ = roc_curve(evl_y,evl['score'])
evl_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ',evl_ks)

4. Generate report

# Build a 20-bin gains table on the validation scores: per-bin bad/good
# counts, cumulative counts, cumulative bad-capture rate, bin bad rate, KS.
# FIX: HTML-extraction garbling repaired ('+ =' -> '+=', '{<!-- -->}' -> '{}');
# the unused row_num/col_num pair was removed.
bins = 20
Y_predict = evl['score']
Y = evl_y
nrows = Y.shape[0]
lis = [(Y_predict[i], Y[i]) for i in range(nrows)]
# Sort by score descending, so bin 0 holds the highest-scored accounts.
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)
bin_num = int(nrows/bins + 1)
# Totals over the whole validation set (label > 0.5 counts as "bad").
bad = sum([1 for (p, y) in ks_lis if y > 0.5])
good = sum([1 for (p, y) in ks_lis if y <= 0.5])
bad_cnt, good_cnt = 0, 0
KS = []
BAD = []
GOOD = []
BAD_CNT = []
GOOD_CNT = []
BAD_PCTG = []
BADRATE = []
dct_report = {}
for j in range(bins):
    ds = ks_lis[j*bin_num: min((j + 1)*bin_num, nrows)]
    bad1 = sum([1 for (p, y) in ds if y > 0.5])
    good1 = sum([1 for (p, y) in ds if y <= 0.5])
    bad_cnt += bad1
    good_cnt += good1
    bad_pctg = round(bad_cnt/sum(evl_y),3)
    badrate = round(bad1/(bad1 + good1),3)
    # KS at this cut: |cumulative bad share - cumulative good share|.
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3)
    KS.append(ks)
    BAD.append(bad1)
    GOOD.append(good1)
    BAD_CNT.append(bad_cnt)
    GOOD_CNT.append(good_cnt)
    BAD_PCTG.append(bad_pctg)
    BADRATE.append(badrate)
# Assemble the report once after the loop (the original rebuilt the dict on
# every iteration with identical final contents).
dct_report['KS'] = KS
dct_report['BAD'] = BAD
dct_report['GOOD'] = GOOD
dct_report['BAD_CNT'] = BAD_CNT
dct_report['GOOD_CNT'] = GOOD_CNT
dct_report['BAD_PCTG'] = BAD_PCTG
dct_report['BADRATE'] = BADRATE
val_repot = pd.DataFrame(dct_report)
val_repot

5. Behavioral scorecard model performance

# Dual-axis pyecharts chart of the gains table: per-bin bad rate on the left
# axis and the KS series on a secondary right axis.
from pyecharts.charts import *
from pyecharts import options as opts
from pylab import *
# SimHei font so CJK labels render; align pandas display for wide characters.
mpl.rcParams['font.sans-serif'] = ['SimHei']
np.set_printoptions(suppress=True)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
line = (

    Line()
    .add_xaxis(list(val_repot.index))
    # Per-bin bad rate on the primary (left) y-axis.
    .add_yaxis(
        "Proportion of bad guys by group",
        list(val_repot.BADRATE),
        yaxis_index=0,
        color="red",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Behavioral scorecard model performance"),
    )
    # Secondary y-axis on the right, fixed to [0, 0.5].
    # NOTE(review): this axis is named "Cumulative proportion of bad guys"
    # but the series plotted on it below is KS — confirm the label is intended.
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="Cumulative proportion of bad guys",
            type_="value",
            min_=0,
            max_=0.5,
            position="right",
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(color="red")
            ),
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )

    )
    .add_xaxis(list(val_repot.index))
    # KS series bound to the secondary axis (yaxis_index=1).
    .add_yaxis(
        "KS",
        list(val_repot['KS']),
        yaxis_index=1,
        color="blue",
        label_opts=opts.LabelOpts(is_show=False),
    )
)
line.render_notebook()

Summary