Fine-tuning a pre-trained ERNIE model for sentence classification in practice

1. Preparation of data set

First, prepare a dataset of our own; I asked ChatGPT to help generate some samples:

{
    "title":"Zun Du Fake Du",
    "data": [{"text": "I love black silk beauties","labels": 2},
              {"text": "I love white silk beauty","labels": 1},
              {"text": "Black silk beauty is really sexy","labels": 2},
              {"text": "White silk beauty is also very charming","labels": 1},
              {"text": "Net stockings make beautiful legs more attractive","labels": 0},
              {"text": "Black silk and white silk are both good-looking","labels": 3},
              {"text": "The beauty in black silk makes my heart move","labels": 3},
              {"text": "The beauty with white silk makes me unable to bear to look at it","labels": 1},
              {"text": "Which one looks better, black silk or white silk?","labels": 3},
              {"text": "I like girls in black silk","labels": 2},
              {"text": "I think white silk is more suitable for me","labels": 1},
              {"text": "Black silk and white silk have different charms","labels": 3}]
}

The file is saved as dummydata.jsonl, and the labels map as follows:

0 fishnet stockings
1 white silk
2 black silk
3 white silk + black silk
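For later use it is convenient to keep this mapping as a Python dict, so predicted ids can be turned back into names. A minimal sketch (these constants are my own addition, not part of the original scripts):

ID2LABEL = {0: "fishnet stockings", 1: "white silk", 2: "black silk", 3: "white silk + black silk"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}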

1. Import the packages used

import evaluate
import torch.utils.data
from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorWithPadding, AutoTokenizer
from torch.utils.data import DataLoader

2. Use the Datasets library to load the data set

1. Read the custom JSON dataset file

This is the location of my json file: F:\bert intent recognition\data\dummydata.jsonl

def load_datasets(test_size: float = 0.2) -> DatasetDict:
    assert 0 < test_size < 1, 'test_size must be in the open interval (0, 1)'
    data = load_dataset('json', data_files='../data/dummydata.jsonl', field='data')
    train_test_valid = data['train'].train_test_split(test_size=test_size)
    dataset = DatasetDict({
        "train": train_test_valid["train"],
        "test": train_test_valid["test"],
        "valid": train_test_valid["train"]})  # the toy dataset is tiny, so validation reuses the training split
    return dataset

Here train_test_split automatically divides the data into a training set and a test set according to the test_size ratio (0.2 by default).

The returned dataset is a DatasetDict containing a training set, a test set, and a validation set (on this toy dataset the validation split simply reuses the training split).
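A quick sanity check of the splits (assuming the function lives in dataset.py, as the later imports suggest):

from dataset import load_datasets

dataset = load_datasets()
print(dataset)              # DatasetDict with 'train', 'test', 'valid' and their row counts
print(dataset["train"][0])  # e.g. {'text': 'I love black silk beauties', 'labels': 2}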

2. Process the data and return DataLoaders

def get_dataloaders(tokenizer, batch_size) -> dict[str, torch.utils.data.DataLoader]:
    # The tokenizer converts raw text into token ids; padding is left to the
    # collator below, which pads each batch dynamically
    tokenize_func = lambda x: tokenizer(x["text"], truncation=True)
    # load_datasets is the function defined above
    dataset = load_datasets()
    # map applies the tokenizer to the whole DatasetDict in batches
    tokenized_datasets = dataset.map(tokenize_func, batched=True)
    # remove the raw text column so only the columns the model accepts remain
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    # return PyTorch tensors
    tokenized_datasets.set_format("torch")
    # dynamic padding: pad each batch to its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["valid"], batch_size=batch_size, collate_fn=data_collator
    )
    test_dataloader = DataLoader(
        tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator
    )
    return {
        "train": train_dataloader,
        "valid": eval_dataloader,
        "test": test_dataloader
    }
This function returns three iterable PyTorch DataLoaders that can be used directly for training, validation, and testing.
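To verify the batches look right, a small sketch (the checkpoint path here is the one train.py uses below):

from transformers import AutoTokenizer
from dataset import get_dataloaders

tokenizer = AutoTokenizer.from_pretrained("../model/embedding_model")
loaders = get_dataloaders(tokenizer, batch_size=4)
batch = next(iter(loaders["train"]))
print({k: tuple(v.shape) for k, v in batch.items()})
# e.g. {'input_ids': (4, 12), 'token_type_ids': (4, 12), 'attention_mask': (4, 12), 'labels': (4,)}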

2. Training train.py

1. Import the packages that need to be used

import os
import torch
import warnings
import evaluate
from tqdm.auto import tqdm
from progressbar import ProgressBar
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from dataset import load_datasets, get_dataloaders
from utils import save_model

2. Set environment variables

# OS HYPER GLOBAL PARAMETERS
warnings.filterwarnings("ignore")
torch.backends.cudnn.enabled = True
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # select the GPU to use; I only have one card
DEVICE = 'cuda'
# TORCH_USE_CUDA_DSA and CUDA_LAUNCH_BLOCKING only help with debugging CUDA errors; cudnn is enabled explicitly

3. Model hyperparameters and save path settings

# - - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"  # local copy of the pre-trained ERNIE 3.0 base (Chinese) checkpoint
BEST_MODEL_SAVE_DIR = "../output/model/best_model"  # best model seen during training
LAST_MODEL_SAVE_DIR = "../output/model/last_model"  # model from the last epoch
# - - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPER PARAMETERS
LEARN_RATE = 5e-5  # learning rate
NUM_EPOCHS = 12  # number of epochs
BATCH_SIZE = 16
NUM_LABELS = 4  # four categories in total
# - - - - - - - - - - - - - - - - - - - - - - - -

4. Training & Evaluation & Saving

1. Preparation before training (all of the following code sits under the if __name__ == "__main__": guard)

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    # build the loaders once; calling get_dataloaders twice would re-split the data randomly each time
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader, eval_dataloader = dataloaders['train'], dataloaders['valid']
    # load checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)
    # define an optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluate settings
    metric = evaluate.load("metric.py", module_type="metric")
    best_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    progress_bar.set_description('training')
    # model to device
    model.to(DEVICE)

If you want to use num_workers multiprocessing in the DataLoader to load data, the code that builds and iterates the loaders must sit under if __name__ == "__main__":!!
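A self-contained illustration of why: with num_workers > 0 the DataLoader spawns worker processes, and under the spawn start method (the default on Windows) the module is re-imported in every worker, so iteration has to happen under the guard. A minimal sketch, not from the original code:

import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    ds = TensorDataset(torch.arange(100))
    # num_workers=2 spawns two loader processes; outside the guard this
    # would re-execute the whole script recursively on Windows
    loader = DataLoader(ds, batch_size=10, num_workers=2)
    for batch in loader:
        pass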

Here we first load the tokenizer and the model, then obtain the training and validation DataLoaders from the get_dataloaders function defined above.

AdamW is used as the optimizer.
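To see what the "linear" schedule actually does to the learning rate, here is a tiny standalone sketch (dummy parameter, 4 steps; not part of the training script):

import torch
from transformers import get_scheduler

opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-5)
sched = get_scheduler("linear", optimizer=opt, num_warmup_steps=0, num_training_steps=4)
for _ in range(4):
    print(opt.param_groups[0]["lr"])  # 5e-5, then decaying linearly toward 0
    opt.step()
    sched.step()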

Defining the metric:

metric = evaluate.load("metric.py", module_type="metric")

Here I load the evaluation script from a local file. You can also load the accuracy metric straight from the Hugging Face Hub with evaluate.load("accuracy"), but my network connection is unreliable, so I copied the script locally.

Here is the metric.py for reference:

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric."""

import datasets
from sklearn.metrics import accuracy_score

import evaluate

_DESCRIPTION = """
Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights. Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input if `normalize` is set to `False`. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

    Example 2-The same as Example 1, except with `normalize` set to `False`.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
        >>> print(results)
        {'accuracy': 3.0}

    Example 3-The same as Example 1, except with `sample_weight` set.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
        >>> print(results)
        {'accuracy': 0.8778625954198473}
"""

_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Accuracy(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("int32")),
                    "references": datasets.Sequence(datasets.Value("int32")),
                }
                if self.config_name == "multilabel"
                else {
                    "predictions": datasets.Value("int32"),
                    "references": datasets.Value("int32"),
                }
            ),
            reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
        )

    def _compute(self, predictions, references, normalize=True, sample_weight=None):
        return {
            "accuracy": float(
                accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
            )
        }
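A quick check that the local metric behaves as expected:

import evaluate

metric = evaluate.load("metric.py", module_type="metric")
metric.add_batch(predictions=[0, 1, 2, 3], references=[0, 1, 2, 2])
print(metric.compute())  # {'accuracy': 0.75}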

2. Training & Evaluation & Saving

1) Training

    for epoch in range(NUM_EPOCHS):
        pbar = ProgressBar().start()  # start the per-epoch progress bar
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)  # advance the tqdm bar once per training step
            total_loss += loss.item()
        average_loss = total_loss / len(train_dataloader)  # average loss over the epoch

2) Evaluation

        # noinspection DuplicatedCode
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
        result = metric.compute()
        accuracy = result['accuracy']
        print(f"epoch: {epoch}, average_loss: {average_loss:.4f}, accuracy: {accuracy:.4f}")

3) Save the model

1. First, define a save-model function (this lives in utils.py, imported above):

import logging

def save_model(tokenizer, model, save_dir):
    tokenizer.save_pretrained(save_dir)
    model.save_pretrained(save_dir)
    logging.info('save done')

2. Then call it at the end of every epoch:

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
        save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
        pbar.finish()

4) Complete train.py

import os
import torch
import warnings
import evaluate
from tqdm.auto import tqdm
from progressbar import ProgressBar
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from dataset import load_datasets, get_dataloaders
from utils import save_model

# OS HYPER GLOBAL PARAMETERS
warnings.filterwarnings("ignore")
torch.backends.cudnn.enabled=True
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = 'cuda'
# - - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"
BEST_MODEL_SAVE_DIR = "../output/model/best_model"
LAST_MODEL_SAVE_DIR = "../output/model/last_model"
# - - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPER PARAMETERS
LEARN_RATE = 5e-5
NUM_EPOCHS = 12
BATCH_SIZE = 16
NUM_LABELS = 4
# - - - - - - - - - - - - - - - - - - - - - - - -


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    # build the loaders once so the random split is consistent
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader, eval_dataloader = dataloaders['train'], dataloaders['valid']
    # load check point
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)
    # define a optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluate settings
    metric = evaluate.load("metric.py", module_type="metric")
    best_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    progress_bar.set_description('training')
    # model to device
    model.to(DEVICE)
    # train & eval & save
    for epoch in range(NUM_EPOCHS):
        pbar = ProgressBar().start()
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)  # advance the tqdm bar once per training step
            total_loss += loss.item()
        average_loss = total_loss / len(train_dataloader)

        # noinspection DuplicatedCode
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
        result = metric.compute()
        accuracy = result['accuracy']
        print(f"epoch: {epoch}, average_loss: {average_loss:.4f}, accuracy: {result['accuracy']:.4f}")
        progress_bar.update(1)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
        save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
        pbar.finish()

3. Prediction predict.py

This mirrors the evaluation part of training; I evaluate on the test set directly rather than printing a predicted label for each input.

import evaluate
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dataset import get_dataloaders
from progressbar import ProgressBar

# DEFINE MODEL PATH
# --------------------------------------------------------
CHECK_POINT_PATH = '../output/model/best_model'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = 'cuda'
BATCH = 8
NUM_LABELS = 4
# --------------------------------------------------------
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
# load test data
test_dataloader = get_dataloaders(tokenizer, batch_size=BATCH)['test']
# init evaluate
accuracy = evaluate.load('metric.py', module_type='metric')
model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)

if __name__ == '__main__':
    pbar = ProgressBar().start()
    model.to(DEVICE)
    # noinspection DuplicatedCode
    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        print(outputs.logits)  # print the raw logits for inspection
        predictions = torch.argmax(logits, dim=-1)
        accuracy.add_batch(predictions=predictions, references=batch["labels"])
    result = accuracy.compute()
    print(f"Accuracy: {result['accuracy']*100:.2f}%")
    pbar.finish()
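If you want to classify a single raw sentence instead of a whole DataLoader, here is a minimal sketch (ID2LABEL is my own mapping from section 1, not part of the original script):

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

CHECK_POINT_PATH = '../output/model/best_model'
ID2LABEL = {0: "fishnet stockings", 1: "white silk", 2: "black silk", 3: "white silk + black silk"}

tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH)
model.eval()

inputs = tokenizer("I love black silk beauties", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
pred = logits.argmax(dim=-1).item()
print(pred, ID2LABEL[pred])  # predicted label id and its name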

That's all for this article. If it helped, remember to follow!