[Error resolution] CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=19]=

1. Project scenario:

The baseline uses CatBoost for the prediction. Related posts:

[AI for Science] Quantum Chemistry: Molecular Property Prediction – First Check-in – Machine Learning Baseline

[AI for Science] Quantum Chemistry: Molecular Property Prediction – Second Check-in – Feature Engineering Baseline Score

2. Problem description:

Original code:

# Import the numpy library for numerical calculations
import numpy as np
#Import the pandas library for data processing and analysis
import pandas as pd
#Import the polars library for processing large-scale data sets
import polars as pl
#Import defaultdict and Counter in the collections library for statistics
from collections import defaultdict, Counter
#Import the CatBoostRegressor library for gradient boosting tree models
from catboost import CatBoostRegressor
#Import StratifiedKFold, KFold and GroupKFold for cross-validation
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
# Use Parallel and delayed in Joblib to implement parallel processing
from joblib import Parallel, delayed
# Import sys, os, gc, argparse, warnings and tqdm for command-line arguments, warnings and progress bars
import sys, os, gc, argparse, warnings, tqdm
# Mean absolute error (MAE) is a common regression metric: the average absolute difference between predicted and observed values
from sklearn.metrics import mean_absolute_error
# Ignore warning messages
warnings.filterwarnings('ignore')
path = 'data'
test0 = np.load(f'{path}/QMB_round1_test_230725_0.npy', allow_pickle=True).tolist()
test1 = np.load(f'{path}/QMB_round1_test_230725_1.npy', allow_pickle=True).tolist()
test = test0 + test1
del test0, test1

# Each training split is about 20 GB after loading, so load selectively according to your available memory. The baseline uses an A10 environment with 30 GB of memory and loads only one training split.
train0 = np.load(f'{path}/QMB_round1_train_230725_0.npy', allow_pickle=True).tolist()
# train1 = np.load(f'{path}/QMB_round1_train_230725_1.npy', allow_pickle=True).tolist()
# train2 = np.load(f'{path}/QMB_round1_train_230725_2.npy', allow_pickle=True).tolist()
# train3 = np.load(f'{path}/QMB_round1_train_230725_3.npy', allow_pickle=True).tolist()
# train4 = np.load(f'{path}/QMB_round1_train_230725_4.npy', allow_pickle=True).tolist()

# train = train0 + train1 + train2 + train3 + train4
# del train0, train1, train2, train3, train4

train = train0
del train0
def get_parallel_feature(data, IS_TRAIN=False):
    # Lengths of the longest and shortest connectivity lists
    max_len = len(max(data['connectivity'], key=len))
    min_len = len(min(data['connectivity'], key=len))
    
    #Extract the maximum out-degree and in-degree, as well as the number of edges
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    
    #Mean, maximum and minimum values of coordinate positions
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    
    # Number of distinct elements; also keep the element sequence as a space-separated string
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    
    # Maximum and mean of formal_charge
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    
    # Proportion of each bond type in edge_attr
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    
    # Merge into a list
    res = [data['mol_name'], data['atom_count'], data['bond_count'], max_len, min_len, edge_list_len] + \
           coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique]
    
    # Return results
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res
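
# Toy sanity check (hypothetical molecule dict; the field layout is inferred from the
# function above and is NOT real competition data) showing the size of one feature row:
# toy = {'mol_name': 'toy', 'atom_count': 3, 'bond_count': 2,
#        'connectivity': [[1], [0, 2], [1]],
#        'edge_list': np.array([[0, 1], [1, 2]]),
#        'coordinates': np.random.rand(3, 3),
#        'elements': [6, 1, 8],
#        'formal_charge': np.zeros(3),
#        'edge_attr': ['1', '2'],
#        'energy': -1.0}
# len(get_parallel_feature(toy, IS_TRAIN=True))  # 24 values, matching the 24 train_df columns below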

### Test Data       
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
      for data in tqdm.tqdm(test)
)

test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','maxlen','minlen','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique'])

### Training data
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
      for data in tqdm.tqdm(train)
)

train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','maxlen','minlen','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','energy'])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df = 0.95, min_df = 1)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data

def CVec(data, seqs):
    tfidf = CountVectorizer(max_df = 0.95, min_df = 1)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data
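
# Toy illustration (made-up token strings, not competition data): these helpers turn a
# space-separated token column into numeric *_tfidf_i / *_cv_i columns that CatBoost can use.
# toy = pd.DataFrame({'elements': ['16 16 12', '16 12 35', '35 35 17']})
# toy = tfidf(toy, 'elements')
# toy = CVec(toy, 'elements')
# toy.filter(regex='elements_(tfidf|cv)_').shape  # (3, 8): 4 tfidf columns + 4 count columns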

### Merge training data and test data
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)

### Perform Tfidf and Count
df = tfidf(df, 'elements')
df = CVec(df, 'elements')

### Split training data and test data
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
def catboost_model(train_x, train_y, test_x, seed = 2023):
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        params = {'learning_rate': 0.01,
                  'depth': 12,
                  'bootstrap_type': 'Bernoulli',
                  'random_seed': 2023,
                  'od_type': 'Iter',
                  'od_wait': 200,
                  'allow_writing_files': False,
                  'task_type': "GPU",  # Run on GPU; set to "CPU" if no GPU is available
                  'devices': '0:1'}
        
        # iterations is the number of boosting rounds; adjust it to your compute budget
        model = CatBoostRegressor(iterations=10000, **params)
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=500,
                  use_best_model=True,
                  cat_features=[],
                  verbose=1)

        val_pred = model.predict(val_x)
        test_pred = model.predict(test_x)
        
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
        
        # Get feature importance scores to facilitate feature evaluation
        if i == 0:
            fea_ = model.feature_importances_
            fea_name = model.feature_names_
            fea_score = pd.DataFrame({'fea_name': fea_name, 'score': fea_})
            fea_score = fea_score.sort_values('score', ascending=False)
            fea_score.to_csv('feature_importances.csv', index=False)
        
    return oof, test_predict
cols = [f for f in test_df.columns if f not in ['elements','energy','mol_name','istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])
# Output the results of the competition question submission format
test_df['energy'] = cat_test
test_df['force'] = test_df['atom_count'].apply(lambda x: ','.join(['0.0' for _ in range(x*3)]))

test_df[['energy','force']].to_csv("submission1.csv", index=True)

The original code can run normally.

Problem code:

A new feature, edge_li (derived from edge_list), was added. The code is as follows:

def get_parallel_feature(data, IS_TRAIN=False):
    
    #Extract the maximum out-degree and in-degree, as well as the number of edges
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    
    #Mean, maximum and minimum values of coordinate positions
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    
    # Number of distinct elements; also keep the element sequence as a space-separated string
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    
    # Maximum and mean of formal_charge
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    
    # Proportion of each bond type in edge_attr
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    
    # Map atom index -> element, used to build the edge_li text feature below
    idx2element = dict(zip([i for i in range(data['atom_count'])], data['elements']))
  
    # Encode each edge as the concatenation of its two endpoint elements * 100 (e.g. C-H -> '600100'), joined into one space-separated string
    edge_li = ' '.join([''.join([str(idx2element[i]*100) for i in li]) for li in data['edge_list']])
    
    # Merge into a list
    res = [data['mol_name'], data['atom_count'], data['bond_count'], edge_list_len] + \
           coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique] + [edge_li]
    
    # Return results
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res
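
# Illustration (toy values, not competition data) of what edge_li looks like: for
# idx2element {0: 6, 1: 1, 2: 8} and edge_list [[0, 1], [1, 2]] the expression above
# yields '600100 100800', i.e. one long space-separated string - a text feature, not a number.
# ' '.join([''.join([str({0: 6, 1: 1, 2: 8}[i] * 100) for i in li]) for li in [[0, 1], [1, 2]]])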

### Test Data       
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
      for data in tqdm.tqdm(test)
)

test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','edge_li'])

### Training data
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
      for data in tqdm.tqdm(train)
)

train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','edge_li','energy'])
del train_samples
del test_samples
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df = 0.95, min_df = 3)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data

def CVec(data, seqs):
    tfidf = CountVectorizer(max_df = 0.95, min_df = 3)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data

### Merge training data and test data
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)
# reduce_mem_usage(train_df)
# reduce_mem_usage(test_df)
# reduce_mem_usage(df)

### Perform Tfidf and Count
#elements
df = tfidf(df,'elements')
reduce_mem_usage(df)  # user-defined memory-reduction helper, not shown here

#edge_li
df = tfidf(df,'edge_li')
# reduce_mem_usage(df)

### Split training data and test data
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
del df

The rest remains unchanged

Question:

I encountered an error when training the CatBoost model:

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=19]="600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600": Cannot convert 'b'600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600'' to float
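
The feature_idx in the message is the column position within the frame passed to model.fit, so the offending column can be looked up directly. A small diagnostic added here for illustration (run with the problem code's cols list, i.e. before 'edge_li' was excluded):

print(train_df[cols].columns[19])  # prints 'edge_li', whose values match the string in the error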


If the new edge_li feature and the TF-IDF step applied to it are removed, the error does not occur.

I later realized that the raw 'edge_li' string column had not been added to the exclusion list when building the modeling features; after excluding it, the error disappeared:

cols = [f for f in test_df.columns if f not in ['elements','energy','mol_name','edge_li','istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])

3. Reason analysis:

Every column passed to CatBoost as a numeric feature must be convertible to float. The raw edge_li column is a long space-separated string, so when it is included in the training matrix without being excluded (or declared as a categorical feature via cat_features), CatBoost tries to parse it as a numeric feature (feature_idx=19 here, which is exactly edge_li's position in cols) and fails with the conversion error above. When modifying features, keep the modeling feature list consistent: exclude raw text columns such as elements and edge_li (their information is already carried by the TF-IDF columns), and make sure the training and test sets use the same features in the same order.
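
As a quick guard, a minimal pre-fit sanity check (my own addition, not part of the baseline) can catch raw string columns and train/test column mismatches early:

# All modeling features should be numeric and identical (names and order) for train and test
non_numeric = [c for c in cols if train_df[c].dtype == object]
assert not non_numeric, f'raw string columns passed as numeric features: {non_numeric}'
assert list(train_df[cols].columns) == list(test_df[cols].columns)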