1. Project scenario:
The baseline uses CatBoost to produce the predictions.
[AI for Science] Quantum Chemistry: Molecular Property Prediction-First Check-in-Machine Learning Baseline
[AI for Science] Quantum Chemistry: Molecular Property Prediction – Second Check-in – Feature Engineering Baseline Score
2. Problem description:
Original code:
# Import numpy for numerical calculations
import numpy as np
# Import pandas for data processing and analysis
import pandas as pd
# Import polars for processing large-scale data sets
import polars as pl
# Import defaultdict and Counter from collections for statistics
from collections import defaultdict, Counter
# Import CatBoostRegressor for the gradient boosting tree model
from catboost import CatBoostRegressor
# Import StratifiedKFold, KFold and GroupKFold for cross-validation
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
# Use Parallel and delayed from joblib for parallel processing
from joblib import Parallel, delayed
# Import sys, os, gc, argparse, warnings and tqdm for command-line arguments, memory cleanup and progress bars
import sys, os, gc, argparse, warnings, tqdm
# Mean absolute error (MAE) is a regression metric: the average absolute difference
# between the predicted values and the observed values.
from sklearn.metrics import mean_absolute_error
# Ignore warning messages
warnings.filterwarnings('ignore')
path = 'data'
test0 = np.load(f'{path}/QMB_round1_test_230725_0.npy', allow_pickle=True).tolist()
test1 = np.load(f'{path}/QMB_round1_test_230725_1.npy', allow_pickle=True).tolist()
test = test0 + test1
del test0, test1

# Each training split is roughly 20 GB once loaded, so load them selectively according to your
# compute budget. The baseline uses an A10 GPU with 30 GB of memory and loads only one training split.
train0 = np.load(f'{path}/QMB_round1_train_230725_0.npy', allow_pickle=True).tolist()
# train1 = np.load(f'{path}/QMB_round1_train_230725_1.npy', allow_pickle=True).tolist()
# train2 = np.load(f'{path}/QMB_round1_train_230725_2.npy', allow_pickle=True).tolist()
# train3 = np.load(f'{path}/QMB_round1_train_230725_3.npy', allow_pickle=True).tolist()
# train4 = np.load(f'{path}/QMB_round1_train_230725_4.npy', allow_pickle=True).tolist()
# train = train0 + train1 + train2 + train3 + train4
# del train0, train1, train2, train3, train4
train = train0
del train0
def get_parallel_feature(data, IS_TRAIN=False):
    # Length of the longest and shortest connected-atom lists
    max_len = len(max(data['connectivity'], key=len))
    min_len = len(min(data['connectivity'], key=len))
    # Extract the maximum out-degree and in-degree, as well as the number of edges
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    # Mean, maximum and minimum of the atomic coordinates
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    # Number of distinct elements
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    # formal_charge maximum and mean
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    # Proportion of each bond type in edge_attr
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    # Merge into a single list
    res = [data['mol_name'], data['atom_count'], data['bond_count'], max_len, min_len, edge_list_len] + \
          coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique]
    # Return the result, appending the target for training data
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res

### Test data
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
    for data in tqdm.tqdm(test)
)
test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','maxlen','minlen','edgelen',
    'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',
    'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',
    'edge_attr_nunique'])

### Training data
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
    for data in tqdm.tqdm(train)
)
train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','maxlen','minlen','edgelen',
    'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',
    'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',
    'edge_attr_nunique','energy'])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df=0.95, min_df=1)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs, str(i))] = res[:, i]
    gc.collect()
    return data

def CVec(data, seqs):
    cvec = CountVectorizer(max_df=0.95, min_df=1)
    res = cvec.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs, str(i))] = res[:, i]
    gc.collect()
    return data

### Merge training data and test data
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)

### Apply TF-IDF and count vectorization
df = tfidf(df, 'elements')
df = CVec(df, 'elements')

### Split back into training data and test data
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
def catboost_model(train_x, train_y, test_x, seed=2023):
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], \
                                     train_x.iloc[valid_index], train_y[valid_index]
        params = {'learning_rate': 0.01,
                  'depth': 12,
                  'bootstrap_type': 'Bernoulli',
                  'od_type': 'Iter',
                  'od_wait': 200,
                  'random_seed': 11,
                  'allow_writing_files': False,
                  # Task type: "GPU" runs training on the GPU; set it to "CPU" if no GPU is available.
                  'task_type': "GPU",
                  'devices': '0:1'}
        # iterations is the number of boosting iterations; adjust it to your compute budget.
        model = CatBoostRegressor(iterations=10000, **params)
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=500,
                  use_best_model=True,
                  cat_features=[],
                  verbose=1)
        val_pred = model.predict(val_x)
        test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
        # Save feature importance scores from the first fold for feature evaluation
        if i == 0:
            fea_ = model.feature_importances_
            fea_name = model.feature_names_
            fea_score = pd.DataFrame({'fea_name': fea_name, 'score': fea_})
            fea_score = fea_score.sort_values('score', ascending=False)
            fea_score.to_csv('feature_importances.csv', index=False)
    return oof, test_predict

cols = [f for f in test_df.columns if f not in ['elements', 'energy', 'mol_name', 'istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])
# Write the predictions in the competition submission format
test_df['energy'] = cat_test
# Placeholder forces: three zero components per atom
test_df['force'] = test_df['atom_count'].apply(lambda x: ','.join(['0.0' for _ in range(x * 3)]))
test_df[['energy', 'force']].to_csv("submission1.csv", index=True)
The original code runs without errors.
Problem code:
A new feature, edge_li, derived from edge_list, was added; the code is as follows:
def get_parallel_feature(data, IS_TRAIN=False):
    # Extract the maximum out-degree and in-degree, as well as the number of edges
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    # Mean, maximum and minimum of the atomic coordinates
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    # Number of distinct elements
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    # formal_charge maximum and mean
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    # Proportion of each bond type in edge_attr
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    # Build a "sentence" from the edge list so TF-IDF can be applied to connectivity
    idx2element = dict(zip([i for i in range(data['atom_count'])], data['elements']))
    # Atomic edges encoded as element-pair tokens
    edge_li = ' '.join([''.join([str(idx2element[i]*100) for i in li]) for li in data['edge_list']])
    # Merge into a single list
    res = [data['mol_name'], data['atom_count'], data['bond_count'], edge_list_len] + \
          coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique] + [edge_li]
    # Return the result, appending the target for training data
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res

### Test data
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
    for data in tqdm.tqdm(test)
)
test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','edgelen',
    'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',
    'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',
    'edge_attr_nunique','edge_li'])

### Training data
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
    for data in tqdm.tqdm(train)
)
train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','edgelen',
    'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',
    'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',
    'edge_attr_nunique','edge_li','energy'])

del train_samples
del test_samples
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df=0.95, min_df=3)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs, str(i))] = res[:, i]
    gc.collect()
    return data

def CVec(data, seqs):
    cvec = CountVectorizer(max_df=0.95, min_df=3)
    res = cvec.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs, str(i))] = res[:, i]
    gc.collect()
    return data

### Merge training data and test data
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)
# reduce_mem_usage(train_df)
# reduce_mem_usage(test_df)
# reduce_mem_usage(df)

### Apply TF-IDF and count vectorization
# elements
df = tfidf(df, 'elements')
reduce_mem_usage(df)
# edge_li
df = tfidf(df, 'edge_li')
# reduce_mem_usage(df)

### Split back into training data and test data
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
del df
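Note that this snippet calls reduce_mem_usage, a helper that is not defined anywhere in the post. It is presumably the common memory-reduction utility that downcasts numeric columns to smaller dtypes. A minimal sketch of such a helper is shown below; this is an assumption about what the author used, not their actual code:

import pandas as pd

def reduce_mem_usage(df):
    # Downcast numeric columns in place to the smallest dtype that can hold their values.
    # NOTE: a guess at the helper referenced in the post, not the author's implementation.
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_integer_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df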
The rest remains unchanged
Question:
I encountered an error when training the CatBoost model:
CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=19]="600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600": Cannot convert 'b'600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600'' to float
If the new edge_li feature and the word-frequency features derived from it are removed, the error does not occur.
Later, I discovered that the raw 'edge_li' string column had not been excluded from the modeling features; after adding it to the exclusion list, the error no longer appears:
cols = [f for f in test_df.columns if f not in ['elements', 'energy', 'mol_name', 'edge_li', 'istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])
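A quick defensive check, not in the original post but useful here, is to verify before training that every column handed to CatBoost is numeric:

# Suggestion, not part of the original baseline:
# fail fast if any raw string column slipped into the modeling features.
non_numeric = train_df[cols].select_dtypes(exclude='number').columns.tolist()
assert not non_numeric, f"Non-numeric feature columns passed to CatBoost: {non_numeric}"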
3. Reason analysis:
When training a CatBoost model, every column passed as an ordinary feature must be convertible to float. The new edge_li column holds a raw space-separated string, so CatBoost raises the "Cannot convert ... to float" error unless that column is either dropped from the feature list (as above) or explicitly declared as a categorical/text feature. More generally, the training and test feature matrices must contain the same columns in the same order, so check the feature list carefully whenever the features are modified.
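One way to avoid this in the future is to derive the feature list from the column dtypes instead of maintaining a hand-written exclusion list, so that any raw text column (such as elements or edge_li) is dropped automatically. A minimal sketch, assuming the train_df/test_df produced above; this is a suggestion rather than part of the original baseline:

# Keep only numeric columns as model inputs; identifiers and the target are excluded by name,
# while raw text columns ('elements', 'edge_li') are excluded automatically by dtype.
exclude = {'energy', 'mol_name', 'istest'}
cols = [c for c in test_df.columns
        if c not in exclude and pd.api.types.is_numeric_dtype(test_df[c])]

cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])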