1. Data import
import warnings

import numpy as np
import pandas as pd
from imblearn import over_sampling, under_sampling
from imblearn.over_sampling import SMOTE

# Silence library warnings so the output stays readable.
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None  # Display all columns
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # Cancel scientific notation

# Raw training and test tables, read from the working directory.
train_data = pd.read_csv('Training.csv')
test_data = pd.read_csv('Test.csv')
2. Data cleaning
# Merge data: stack training and test rows so both get identical cleaning.
# ignore_index=True gives every row a unique label; without it the test rows
# repeat the labels of the training rows and label-based alignment is ambiguous.
total_data = pd.concat([train_data, test_data], ignore_index=True)

# Replace the '.' and '?' placeholders with NaN (whole-cell matches only).
total_data = total_data.replace(['.', '?'], np.nan)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would print an extra "None").
total_data[['area']].info()
According to the data dictionary, a few points are worth noting:
1) `area` is a character string: the first digit of the area code is the major classification, the first two digits are the middle classification, and so on. In theory it should therefore have 3 digits, so we need to convert the abnormal data into null values and extract the first three digits separately;
2) The fields in the yellow part are related, such as ck-saveall = ck-savetime × ck-saveavg, so it can be filled by calculation;
3) The `ck` field is related to the fields whose names start with `ck-`. In theory, as long as any of those fields has a value > 0, `ck` should be 1;
4) Other fields probably also have internal connections, but because I am not familiar with the banking industry, I did not dare to change them;
5) Replacing missing values with the mean is not recommended. Because I will use the LightGBM algorithm later, the missing values can be left unfilled; if RandomForest is used, it is recommended to fill the missing values with -1.
# Area codes shorter than three characters are treated as abnormal and set to
# NaN. The result is assigned back explicitly: where(..., inplace=True) called
# on total_data['area'] acts on a temporary Series and is not guaranteed to
# reach total_data (it does not under pandas copy-on-write).
valid_area = total_data['area'].str.len() >= 3
total_data['area'] = total_data['area'].where(valid_area)

# info() prints its report itself and returns None; no print() wrapper needed.
total_data[['area']].info()
# Convert some object variables to numerical variables
# Two columns lack the hyphen the other '<prefix>-<name>' columns use; rename
# them so the string templates used later match.
total_data.rename(
    columns={'depsaveavg': 'dep-saveavg', 'depdrawavg': 'dep-drawavg'},
    inplace=True,
)

# Every column except these five is converted to a numeric dtype.
# The list is built from total_data.columns in order (not via set arithmetic)
# so the feature order is the same on every run.
non_numeric = {'ID', 'area', 'ck', 'comp', 'VV'}
num_features = [col for col in total_data.columns if col not in non_numeric]
for col in num_features:
    total_data[col] = pd.to_numeric(total_data[col])
total_data.info()
def cap(x, quantile=(0.05, 0.95)):
    """Cap the outliers of a numeric column at two quantiles.

    Args:
        x: pd.Series column, continuous variable.
        quantile: Pair ``(lower, upper)`` of quantile levels. Values below the
            lower quantile or above the upper quantile are replaced by that
            quantile.

    Returns:
        A new pd.Series with the same index as ``x``. Missing values stay
        missing and ``x`` itself is not modified.
    """
    # Generate quantiles
    lower, upper = x.quantile(list(quantile)).tolist()
    # clip() replaces everything outside [lower, upper] with the nearest bound.
    return x.clip(lower=lower, upper=upper)


# Remember the column order so it can be restored after the concat below.
columns = total_data.columns.tolist()

# Cap every numeric column of the merged table; the quantiles are computed
# over all rows of total_data.
total_data_1 = total_data[num_features]
total_data_2 = total_data_1.apply(cap)
new_total_data = pd.concat(
    [total_data[['ID', 'area', 'ck', 'comp', 'VV']], total_data_2],
    axis=1,
)
total_data = new_total_data[columns]
total_data.head()
# According to the rules, supplement the data of 'ck' and 'dep' related fields.
# For every prefix the columns satisfy <prefix>all = <prefix>time * <prefix>avg,
# so a missing one can be derived from the other two.
for way in ['ck-save', 'ck-draw', 'dep-save', 'dep-draw']:
    all_col = '{}all'.format(way)
    time_col = '{}time'.format(way)
    avg_col = '{}avg'.format(way)

    # Derive all three candidates before any filling, so a value filled in
    # below is never reused as an input of another derivation.
    total_data['new_' + all_col] = total_data[time_col] * total_data[avg_col]
    total_data['new_' + time_col] = total_data[all_col] / total_data[avg_col]
    total_data['new_' + avg_col] = total_data[all_col] / total_data[time_col]

    for col in (all_col, time_col, avg_col):
        new_col = 'new_' + col
        # A zero divisor yields +/-inf; treat it as "cannot be derived" so no
        # infinity is written into the features.
        total_data[new_col] = total_data[new_col].replace([np.inf, -np.inf], np.nan)
        missing = total_data[col].isnull()
        total_data.loc[missing, col] = total_data.loc[missing, new_col]

# info() prints its report itself and returns None; no print() wrapper needed.
total_data.info()
# Supplement the data of ck
print(total_data['ck'].value_counts())

# Set ck to '1' on every row where at least one of these columns is positive.
ck_related_columns = [
    'ck-saveall',
    'ck-drawall',
    'ck-drawtime',
    'ck-saveavg',
    'ck-drawavg',
    'ck-savetime',
    'ck-changame',
    'ck-changtime',
    'ck-avg',
]
any_positive = (total_data[ck_related_columns] > 0).any(axis=1)
total_data.loc[any_positive, 'ck'] = '1'
print(total_data['ck'].value_counts())
3. Data integration and upsampling
Explanation: The prediction results are scored by the F-measure on small and medium-sized enterprises with capital needs, and the samples are extremely unbalanced, so the data distribution has to be adjusted through upsampling. I used the SMOTE method here (during the exam I ran out of time and simply replicated the class-1 samples 40 times).
# filter useful feature values
cate_features = ['area', 'ck', 'comp']
predictors = num_features + cate_features
all_columns = predictors + ['ID', 'VV']
total_data = total_data[all_columns]

# Missing values are encoded as -1 before resampling.
total_data = total_data.fillna(-1)
# Fixed: the loop iterated over the undefined name `category_features`.
for col in cate_features:
    total_data[col] = pd.to_numeric(total_data[col])

# Rows whose target is the literal 'Withheld' form the test set.
# .copy() makes both frames independent of total_data, so the column
# assignments made on them do not write into a slice.
is_test = total_data['VV'] == 'Withheld'
new_train_data = total_data[~is_test].copy()
new_test_data = total_data[is_test].copy()

# upsampling
# NOTE(review): SMOTE interpolates between samples, so the resampled
# area/ck/comp values need not be valid category codes, and resampling before
# the train/validation split lets synthetic neighbours of validation rows
# into training - confirm both are acceptable.
smo = SMOTE(random_state=42)
new_train_data['VV'] = new_train_data['VV'].astype(int)
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
X_smo, y_smo = smo.fit_resample(new_train_data[predictors], new_train_data['VV'])
last_train_data = pd.concat([X_smo, y_smo], axis=1)
last_train_data.info()
last_train_data.head()
4. Modeling prediction
(1) LightGBM
# simple prediction
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import lightgbm as lgb

params = {
    # Strong influence on the result: larger values fit better, but too large
    # a value overfits.
    'num_leaves': 30,
    'min_data_in_leaf': 30,
    'objective': 'binary',  # defined objective function
    'max_depth': -1,
    'learning_rate': 0.01,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.8,  # Extracted feature ratio
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,  # l1 regular
    # 'lambda_l2': 0.001,  # l2 regular
    "verbosity": -1,
    "nthread": -1,  # Number of threads (author's note: -1 means all threads)
    'metric': {'binary_logloss'},  # evaluation function selection
    "random_state": 2020,  # Random seed, keeps results consistent between runs
    # 'device': 'gpu'  # With the gpu build of lightgbm this speeds up training
}

X_train, X_val, y_train, y_val = train_test_split(
    last_train_data[predictors],
    last_train_data["VV"],
    test_size=0.2,
    random_state=2020,
)

# Categorical columns are declared on the Dataset; the validation set takes
# its settings from the training set through reference=.
training_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cate_features)
val_data = lgb.Dataset(X_val, label=y_val, reference=training_data)
evals_result = {}  # used to record training results

# Early stopping, logging and result recording are passed as callbacks; the
# early_stopping_rounds / verbose_eval / evals_result keyword arguments were
# removed from lgb.train.
model = lgb.train(
    params,
    training_data,
    num_boost_round=10000,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=500),
        lgb.record_evaluation(evals_result),
    ],
)

# Threshold the predicted scores at 0.5 to obtain class labels.
val_pred = model.predict(X_val)
val_pred = np.where(val_pred >= 0.5, 1, 0)
# Series.as_matrix() was removed from pandas; to_numpy() is the replacement.
val_true = y_val.to_numpy()
print(classification_report(val_true, val_pred))

test_pred = model.predict(new_test_data[predictors])
test_pred = np.where(test_pred >= 0.5, 1, 0)
print(sum(test_pred))
print(len(test_pred))

# Write the submission from a copy so new_test_data keeps its original 'VV'.
answer = new_test_data.copy()
answer['VV'] = test_pred
answer[['ID', 'VV']].to_csv('results.csv', index=False)
(2) Random Forest
# Using RF for simple prediction
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# This model is fitted on new_train_data (the frame before SMOTE), with a
# stratified split so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    new_train_data[predictors],
    new_train_data["VV"],
    test_size=0.2,
    random_state=2020,
    stratify=new_train_data["VV"],
)

# random_state fixes the bootstrap/feature sampling so reruns give the same forest.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=2020,
)
rf.fit(X_train, y_train)

# Metrics take (y_true, y_pred) in that order. F1 is printed next to accuracy
# because accuracy alone says little on heavily unbalanced classes.
val_pred = rf.predict(X_val)
print(accuracy_score(y_val, val_pred))
print(f1_score(y_val, val_pred))

pred = rf.predict(new_test_data[predictors])
new_test_data['VV'] = pred
new_test_data[['ID', 'VV']].to_csv('up_answer.csv', index=False)
print(sum(pred))
print(len(pred))
(3) LightGBM 5-fold cross-validation
# 5-fold cross-validation
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

train_x = last_train_data[predictors]
train_y = last_train_data['VV']
test_x = new_test_data[predictors]
X, y, X_test = train_x.values, train_y.values, test_x.values  # convert to np.array type


def self_metric(preds, eval_data):
    """Custom LightGBM evaluation function that reports the F1 score.

    LightGBM calls a ``feval`` function positionally as
    ``feval(preds, eval_data)``.

    Args:
        preds: Predicted scores for the evaluated rows; thresholded at 0.5.
        eval_data: The ``lgb.Dataset`` being evaluated; its labels are the
            ground truth.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)``.
    """
    labels = eval_data.get_label()
    pred = np.where(preds >= 0.5, 1, 0)
    # Fixed: the score is computed on the thresholded predictions, not on the
    # labels compared with themselves.
    return 'f1', f1_score(labels, pred), True


param = {
    # Strong influence on the result: larger values fit better, but too large
    # a value overfits.
    'num_leaves': 30,
    'min_data_in_leaf': 30,
    'objective': 'binary',  # defined objective function
    'max_depth': -1,
    'learning_rate': 0.01,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.8,  # Extracted feature ratio
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,  # l1 regular
    # 'lambda_l2': 0.001,  # l2 regular
    "verbosity": -1,
    "nthread": -1,  # Number of threads (author's note: -1 means all threads)
    'metric': {'binary_logloss'},  # evaluation function selection
    "random_state": 2020,  # Random seed, keeps results consistent between runs
    # 'device': 'gpu'  # With the gpu build of lightgbm this speeds up training
}

# 5-fold cross-validation
folds = KFold(n_splits=5, shuffle=True, random_state=36)
folds
predictions = []  # per-fold predicted scores for the test set
for fold_, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("{} cross-validation:".format(fold_ + 1))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    training_data = lgb.Dataset(X_train, label=y_train)  # training data
    validation_data = lgb.Dataset(X_valid, label=y_valid)  # validation data

    # Early stopping and logging are passed as callbacks; the
    # early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train.
    clf = lgb.train(
        param,
        training_data,
        num_boost_round=10000,
        valid_sets=[validation_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=1000),
        ],
        # feval=self_metric,
    )

    # Same >= 0.5 threshold as the final prediction below.
    x_pred = clf.predict(X_valid, num_iteration=clf.best_iteration)
    x_pred = np.where(x_pred >= 0.5, 1, 0)
    print(f1_score(y_valid, x_pred))

    y_test = clf.predict(X_test, num_iteration=clf.best_iteration)  # prediction
    predictions.append(y_test)

# Average the fold scores per test row. np.mean over axis 0 works for any
# number of folds and test rows, replacing the hard-coded 6537 rows / 5 folds.
pred1 = np.mean(predictions, axis=0)
pred = np.where(pred1 >= 0.5, 1, 0)
print(sum(pred))
print(len(pred))
(4) LGBMClassifier 5-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import lightgbm as lgb
from scipy import stats

# Convert to np.array type. y is converted as well, so the fold indices below
# select rows by position rather than by index label.
X = last_train_data[predictors].values
y = last_train_data['VV'].values
X_test = new_test_data[predictors].values

folds = KFold(n_splits=5, shuffle=True, random_state=36)
predictions = []  # per-fold predicted labels for the test set
for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("{}th cross-validation:".format(k + 1))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    clg = LGBMClassifier(
        boosting="gbdt",
        learning_rate=0.1,
        colsample_bytree=0.8,
        # max_depth=5,
        # n_estimators=100,
        num_leaves=31,
        lambda_l1=0.1,
        lambda_l2=0.1,
        seed=0,
    )
    # fit() no longer accepts verbose=; a log_evaluation callback with
    # period=0 keeps the per-iteration evaluation output switched off.
    clg.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(period=0)],
    )
    train_pred = clg.predict(X_train)
    valid_pred = clg.predict(X_valid)
    print("Score of this round of training set: %.2f%%" % (f1_score(y_train, train_pred) * 100))
    print("Validation set score of this round: %.2f%%" % (f1_score(y_valid, valid_pred) * 100))
    pred = clg.predict(X_test)
    predictions.append(pred)

# Majority vote across folds for every test row. np.ravel makes the result
# one value per row whether stats.mode returns shape (1, n) or (n,), which
# differs between SciPy versions.
last_pred = np.ravel(stats.mode(np.asarray(predictions), axis=0)[0])
new_test_data['VV'] = last_pred
new_test_data[['ID', 'VV']].to_csv('answer.csv', index=False)
print(sum(last_pred))
print(len(last_pred))