1. Preparation of data set
First, prepare your own data set. I asked ChatGPT to help generate some sample data:
{ "title":"Zun Du Fake Du", "data": [{"text": "I love black silk beauties","labels": 2}, {"text": "I love white silk beauty","labels": 1}, {"text": "Black silk beauty is really sexy","labels": 2}, {"text": "White silk beauty is also very charming","labels": 1}, {"text": "Net stockings make beautiful legs more attractive","labels": 0}, {"text": "Black silk and white silk are both good-looking","labels": 3}, {"text": "The beauty in black silk makes my heart move","labels": 3}, {"text": "The beauty with white silk makes me unable to bear to look at it","labels": 1}, {"text": "Which one looks better, black silk or white silk?","labels": 3}, {"text": "I like girls in black silk","labels": 2}, {"text": "I think white silk is more suitable for me","labels": 1}, {"text": "Black silk and white silk have different charms","labels": 3}] }
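The file is named dummydata.jsonl. Note that, despite the extension, it holds a single JSON object whose records live under the "data" key. The labels have the following meaning: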
0 | Net socks |
1 | White silk |
2 | Black Silk |
3 | white silk + black silk |
1. Import the packages used
import evaluate import torch.utils.data from datasets import load_dataset, DatasetDict, Dataset from transformers import DataCollatorWithPadding, AutoTokenizer from torch.utils.data import DataLoader
2. Use the Datasets library to load the data set
1. Read custom json data set file
This is the location of my json file: F:\bert intent recognition\data\dummydata.jsonl
def load_datasets(test_size: float = 0.1, seed: int = 42) -> DatasetDict:
    """Load the custom JSON data set and split it into train/test/valid parts.

    The records are read from the ``data`` field of
    ``../data/dummydata.jsonl`` and split once into a training portion and a
    test portion.

    Args:
        test_size: Fraction of the records held out for the test split; must
            lie strictly between 0 and 1. The default is 0.1, which is the
            ratio the previous implementation always applied (it ignored
            this argument and hard-coded 0.1).
        seed: Seed for the shuffle performed by ``train_test_split``. A fixed
            seed makes every call produce the same split, so a separate
            script that asks for the "test" split does not receive records
            that another run trained on.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"test"`` and
        ``"valid"``. ``"valid"`` is the same data as ``"train"``.

    Raises:
        ValueError: If ``test_size`` is not strictly between 0 and 1.
    """
    # Raise instead of assert so the check survives `python -O`.
    if not 0 < test_size < 1:
        raise ValueError('value must in range (0-1)')

    data = load_dataset('json', data_files='../data/dummydata.jsonl', field='data')
    train_test_valid = data['train'].train_test_split(test_size=test_size, seed=seed)

    return DatasetDict({
        "train": train_test_valid["train"],
        "test": train_test_valid["test"],
        # No separate validation data exists; the training split is reused.
        "valid": train_test_valid["train"],
    })
Here the data set is automatically split into a training set and a test set, with 10% of the records held out for testing.
The resulting dataset is a DatasetDict containing a training set, a test set, and a validation set (the validation set has the same content as the training set).
2. Process and return dataloader
def get_dataloaders(tokenizer, batch_size) -> dict[str, DataLoader]:
    """Tokenize the data set and wrap each split in a PyTorch ``DataLoader``.

    Args:
        tokenizer: Tokenizer used both to encode the ``"text"`` column and to
            build the padding collator.
        batch_size: Number of examples per batch for every loader.

    Returns:
        A dict with the keys ``"train"``, ``"valid"`` and ``"test"``, each
        mapped to a ``DataLoader``. Only the training loader shuffles.
    """
    def tokenize_batch(batch):
        # Convert the raw text to token ids, padded and truncated, as "pt" tensors.
        return tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")

    # load_datasets is the function defined earlier in this file.
    dataset = load_datasets()
    # Apply the tokenizer to every split, one batch of rows at a time.
    tokenized_datasets = dataset.map(tokenize_batch, batched=True)
    # Drop the raw text so only the columns the BERT-style model accepts remain.
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    # Have the data set hand out torch tensors.
    tokenized_datasets.set_format("torch")

    # The collator pads each batch when it is assembled.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    return {
        split: DataLoader(
            tokenized_datasets[split],
            shuffle=(split == "train"),
            batch_size=batch_size,
            collate_fn=data_collator,
        )
        for split in ("train", "valid", "test")
    }
This function returns a dict of three PyTorch DataLoaders (train, valid and test) that can be iterated directly for training, validation and testing.
2. Training train.py
1. Import the packages that need to be used
import os import torch import warnings import evaluate from tqdm.auto import tqdm from progressbar import ProgressBar from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler from dataset import load_datasets, get_dataloaders from utils import save_model
2. Set environment variables
# OS HYPER GLOBAL PARAMETERS
warnings.filterwarnings("ignore")  # suppress all Python warnings
torch.backends.cudnn.enabled = True  # cudnn setting
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# The next two variables are only there to help debug CUDA errors.
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Set the graphics card used, I only have one card here
# Device the model and batches are moved to; fall back to the CPU when no GPU is usable.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
3. Model hyperparameters and save path settings
# - - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"  # Here is the pre-trained ernie-zh-base3.0 used
BEST_MODEL_SAVE_DIR = "../output/model/best_model"  # save the best model during training
LAST_MODEL_SAVE_DIR = "../output/model/last_model"  # This is the last epoch save model
# - - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPER PARAMETERS
LEARN_RATE = 5e-5  # learning rate
NUM_EPOCHS = 12  # epochs
BATCH_SIZE = 16
NUM_LABELS = 4  # A total of four categories are required
# - - - - - - - - - - - - - - - - - - - - - - - - - -
4. Training & Evaluation & Saving
1. Preparation before training (the following codes are all under the main thread)
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    # Build the loaders once: each get_dataloaders call reloads, re-splits and
    # re-tokenizes the data set, so calling it per split doubles the work and
    # draws the two loaders from different splits.
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader = dataloaders['train']
    eval_dataloader = dataloaders['valid']

    # load check point
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)

    # define a optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    # The scheduler is stepped once per batch, so it needs the total step count.
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluate settings
    metric = evaluate.load("metric.py", type="metric")
    best_accuracy = 0.0
    # The bar is advanced once per epoch, so its total is the epoch count.
    progress_bar = tqdm(range(NUM_EPOCHS))
    progress_bar.set_description('training')

    # model to device
    model.to(DEVICE)
If you want to load data in parallel with the DataLoader's num_workers option, this code must be placed under `if __name__ == "__main__":`!
Here we first load the tokenizer and the model, and then obtain the training and validation loaders from the get_dataloaders function defined above.
AdamW is used as the optimizer.
Define assessment
metric = evaluate.load("metric.py", type="metric")
Here I am loading a local evaluation script. You can also load the metric straight from the Hugging Face Hub with evaluate.load("accuracy"). However, my network connection is poor, so I copied the script to a local file instead.
Here is a metric.py for everyone:
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric."""

import datasets
from sklearn.metrics import accuracy_score

import evaluate


_DESCRIPTION = """
Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative
"""


_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

    Example 2-The same as Example 1, except with `normalize` set to `False`.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
        >>> print(results)
        {'accuracy': 3.0}

    Example 3-The same as Example 1, except with `sample_weight` set.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
        >>> print(results)
        {'accuracy': 0.8778625954198473}
"""


_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""


# Metric class that `evaluate.load("metric.py", ...)` picks up from this script.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Accuracy(evaluate.Metric):
    def _info(self):
        """Describe the metric: its texts, input features and reference URL."""
        # The "multilabel" configuration takes a sequence of ints per example;
        # every other configuration takes a single int per example.
        if self.config_name == "multilabel":
            feature_spec = {
                "predictions": datasets.Sequence(datasets.Value("int32")),
                "references": datasets.Sequence(datasets.Value("int32")),
            }
        else:
            feature_spec = {
                "predictions": datasets.Value("int32"),
                "references": datasets.Value("int32"),
            }

        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(feature_spec),
            reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
        )

    def _compute(self, predictions, references, normalize=True, sample_weight=None):
        """Return ``{"accuracy": score}`` computed by sklearn's ``accuracy_score``."""
        score = accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
        return {"accuracy": float(score)}
2. Training & Evaluation & Saving
1) Training
for epoch in range(NUM_EPOCHS):
    pbar = ProgressBar().start()  # This is the start progress bar
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear the gradients so the next batch does not accumulate onto them.
        optimizer.zero_grad()
        total_loss += loss.item()
    average_loss = total_loss / len(train_dataloader)  # Calculate the average loss of an epoch
2) Evaluation
# noinspection DuplicatedCode
# Validation pass; it reports `epoch` and `average_loss`, so it belongs to the
# body of the epoch loop, after the training batches.
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    # No gradients are needed while evaluating.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

result = metric.compute()
accuracy = result['accuracy']
print(f"epoch: {epoch}, average_loss: {average_loss:.4f},accuracy: {result['accuracy']:.4f}")
progress_bar.update(1)
3) Save the model
1. First define a save model function:
def save_model(tokenizer, model, save_dir):
    """Save the tokenizer and the model into the same directory.

    Args:
        tokenizer: Object exposing ``save_pretrained(save_dir)``.
        model: Object exposing ``save_pretrained(save_dir)``.
        save_dir: Target directory passed to both ``save_pretrained`` calls.
    """
    # Imported here so the function works when copied on its own.
    import logging

    tokenizer.save_pretrained(save_dir)
    model.save_pretrained(save_dir)
    logging.info('save done')
2. Save the model code:
# Keep a separate copy of the weights whenever the accuracy improves.
if accuracy > best_accuracy:
    best_accuracy = accuracy
    save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
# The "last" checkpoint is written regardless of the accuracy.
save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
pbar.finish()
4) Complete train.py
import os
import warnings

import evaluate
import torch
from progressbar import ProgressBar
from tqdm.auto import tqdm
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler

from dataset import load_datasets, get_dataloaders
from utils import save_model

# OS HYPER GLOBAL PARAMETERS
warnings.filterwarnings("ignore")
torch.backends.cudnn.enabled = True
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fall back to the CPU when no GPU is usable instead of failing in model.to().
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# - - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"
BEST_MODEL_SAVE_DIR = "../output/model/best_model"
LAST_MODEL_SAVE_DIR = "../output/model/last_model"
# - - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPER PARAMETERS
LEARN_RATE = 5e-5
NUM_EPOCHS = 12
BATCH_SIZE = 16
NUM_LABELS = 4
# - - - - - - - - - - - - - - - - - - - - - - - -

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    # Build the loaders once: each get_dataloaders call reloads, re-splits and
    # re-tokenizes the data set.
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader = dataloaders['train']
    eval_dataloader = dataloaders['valid']

    # load check point
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)

    # define a optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    # The scheduler is stepped once per batch, so it needs the total step count.
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluate settings
    metric = evaluate.load("metric.py", type="metric")
    best_accuracy = 0.0
    # The bar is advanced once per epoch below, so its total is the epoch count.
    progress_bar = tqdm(range(NUM_EPOCHS))
    progress_bar.set_description('training')

    # model to device
    model.to(DEVICE)

    # train & eval & save
    for epoch in range(NUM_EPOCHS):
        pbar = ProgressBar().start()

        # --- training ---
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        average_loss = total_loss / len(train_dataloader)

        # --- evaluation ---
        # noinspection DuplicatedCode
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        result = metric.compute()
        accuracy = result['accuracy']
        print(f"epoch: {epoch}, average_loss: {average_loss:.4f}, accuracy: {result['accuracy']:.4f}")
        progress_bar.update(1)

        # --- saving ---
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
        # The "last" checkpoint is overwritten every epoch.
        save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
        pbar.finish()
3. Prediction predict.py
This is similar to the training process. I only wrote the evaluation part here; the script reports accuracy on the test set and does not output the actual predicted labels.
import os

import evaluate
import torch
from progressbar import ProgressBar
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from dataset import get_dataloaders

# DEFINE MODEL PATH
# -------------------------------------------------------
CHECK_POINT_PATH = '../output/model/best_model'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fall back to the CPU when no GPU is usable instead of failing in model.to().
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH = 8
NUM_LABELS = 4
# -------------------------------------------------------

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
# load test data
test_dataloader = get_dataloaders(tokenizer, batch_size=BATCH)['test']
# init evaluate
accuracy = evaluate.load('metric.py', type='metric')
model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)

if __name__ == '__main__':
    pbar = ProgressBar().start()
    model.to(DEVICE)

    # noinspection DuplicatedCode
    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        # No gradients are needed while evaluating.
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        print(outputs.logits)
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        accuracy.add_batch(predictions=predictions, references=batch["labels"])

    result = accuracy.compute()
    print(f"Accuracy: {result['accuracy']*100:.2f}%")
    pbar.finish()
That's all for this article. Remember to follow!