Hands-on entity recognition with BERT + BiLSTM + CRF in PyTorch

First, we need to import the required libraries:

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from torchcrf import CRF  # provided by the pytorch-crf package

Then we define some hyperparameters, the device, and the model structure:

# Hyperparameters
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001  # note: much smaller values (e.g. 2e-5) are common when fine-tuning BERT

# Use a GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_model = BertModel.from_pretrained('bert-base-chinese')
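
The code below also relies on a label2id / id2label mapping between tag names and integer ids, which is never defined here. A minimal sketch, assuming a standard BIO tag set (replace it with the labels of your own dataset):

# Hypothetical tag set; use the labels that actually appear in your data
label_list = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}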

class EntityModel(nn.Module):
    def __init__(self, bert_model, hidden_size, num_tags):
        super(EntityModel, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        # BiLSTM over the BERT token embeddings; the two directions together
        # restore the original hidden size
        self.bilstm = nn.LSTM(bidirectional=True, input_size=hidden_size, hidden_size=hidden_size // 2, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_tags)
        # batch_first=True so the CRF accepts emissions of shape (batch, seq_len, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, _ = self.bilstm(sequence_output)
        logits = self.fc(lstm_output)
        if labels is not None:
            # The CRF returns the log-likelihood; negate it to get a loss to minimize
            loss = -self.crf(logits, labels, mask=attention_mask.byte())
            return loss
        else:
            tags = self.crf.decode(logits, mask=attention_mask.byte())
            return tags

Here, BERT and the BiLSTM layer extract contextual features for each token, a fully connected layer maps them to the tag space, and a CRF layer models the dependencies between adjacent tags in the label sequence.
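
As a quick sanity check (purely illustrative, and assuming the hypothetical label2id above), you can push a small random batch through the model: with labels it should return a scalar loss, and without labels a decoded tag sequence per sentence:

# Dummy batch: 2 sentences of 16 random token ids, all labelled 'O'
check_model = EntityModel(bert_model, hidden_size=768, num_tags=len(label2id)).to(device)
dummy_ids = torch.randint(1, tokenizer.vocab_size, (2, 16), device=device)
dummy_mask = torch.ones_like(dummy_ids)
dummy_labels = torch.zeros_like(dummy_ids)

print(check_model(dummy_ids, dummy_mask, dummy_labels))  # scalar negative log-likelihood
print(check_model(dummy_ids, dummy_mask))                # list of 2 decoded tag-id sequences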

Next, we need to define some helper functions:

def tokenize_and_preserve_labels(text, labels):
    tokenized_text = []
    token_labels = []
    for word, label in zip(text, labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_text.extend(tokenized_word)
        token_labels.extend([label] * n_subwords)

    return tokenized_text, token_labels

def pad_sequences(sequences, max_len, padding_value=0):
    # Initialize with the padding value (the original version always padded with 0)
    padded_sequences = torch.full((len(sequences), max_len), padding_value).long()
    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        if seq_len <= max_len:
            padded_sequences[i, :seq_len] = torch.tensor(seq)
        else:
            padded_sequences[i, :] = torch.tensor(seq[:max_len])
    return padded_sequences

def train(model, optimizer, train_dataloader):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        # TensorDataset batches are tuples: (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = [t.to(device) for t in batch]

        loss = model(input_ids, attention_mask, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    return avg_train_loss

def evaluate(model, eval_dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            loss = model(input_ids, attention_mask, labels)
            total_loss += loss.item()

    avg_eval_loss = total_loss / len(eval_dataloader)
    return avg_eval_loss

def predict(model, text):
    model.eval()
    tokenized_text = tokenizer.tokenize(text)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokenized_text)])
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Without labels, the model returns the CRF-decoded tag ids for each token
        tags = model(input_ids.to(device), attention_mask.to(device))

    tag_labels = [id2label[tag] for tag in tags[0]]
    return list(zip(tokenized_text, tag_labels))

Here we define a tokenization function that converts raw text and labels into tokenized sequences, propagating each word's label to all of its subword tokens. We also define a padding function that pads the sequences to a fixed length so they can be batched, followed by the training, evaluation, and prediction functions.
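
For example (using the hypothetical BIO tags from above), a word that the tokenizer splits into several word pieces ends up with its label copied onto every piece:

# Single Chinese characters stay as one token each with bert-base-chinese;
# an English word or number in the text may split into several word pieces,
# and each piece then carries the original word's label
words = ['小', '明', '住', '在', '北', '京']            # "Xiao Ming lives in Beijing"
labels = ['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC']
tokens, token_labels = tokenize_and_preserve_labels(words, labels)
print(list(zip(tokens, token_labels)))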

Next, we need to load the dataset and convert it into the format required by the model:

# Load data set
train_data = []
with open('train.txt', 'r', encoding='utf-8') as f:
    words = []
    labels = []
    for line in f:
        line = line.strip()
        if line == '':
            train_data.append((words, labels))
            words = []
            labels = []
        else:
            word, label = line.split()
            words.append(word)
            labels.append(label)

if len(words) > 0:
    train_data.append((words, labels))

# Convert the dataset to the format required by the model
train_input_ids = []
train_attention_masks = []
train_labels = []

for words, labels in train_data:
    tokenized_text, token_labels = tokenize_and_preserve_labels(words, labels)
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    attention_mask = [1] * len(input_ids)

    train_input_ids.append(input_ids)
    train_attention_masks.append(attention_mask)
    train_labels.append([label2id[label] for label in token_labels])

train_input_ids = pad_sequences(train_input_ids, MAX_LEN)
train_attention_masks = pad_sequences(train_attention_masks, MAX_LEN)
train_labels = pad_sequences(train_labels, MAX_LEN, padding_value=-1)

train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Similarly, we also need to load the validation and test sets and convert them into
# the same format (this is what produces the eval_dataloader used below)

Here we load a file containing the training data, where each line holds a token and its label and blank lines separate sentences, and convert it into the format required by the model using the tokenization and padding functions defined above.
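
For reference, the loader above expects a CoNLL-style file with one "token label" pair per line and a blank line between sentences. A toy file in that format (tags and file name purely illustrative) could be written like this:

# Write a toy file in the expected format; the loader above reads 'train.txt'
toy_sentences = [
    [('小', 'B-PER'), ('明', 'I-PER'), ('住', 'O'), ('在', 'O'), ('北', 'B-LOC'), ('京', 'I-LOC')],
    [('我', 'O'), ('爱', 'O'), ('中', 'B-LOC'), ('国', 'I-LOC')],
]
with open('toy_train.txt', 'w', encoding='utf-8') as f:
    for sentence in toy_sentences:
        for word, label in sentence:
            f.write(f'{word} {label}\n')
        f.write('\n')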

Finally, we can use the above helper functions and datasets to train, evaluate, and test the model:

# Training model
model = EntityModel(bert_model, hidden_size=768, num_tags=len(label2id))
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    avg_train_loss = train(model, optimizer, train_dataloader)
    avg_eval_loss = evaluate(model, eval_dataloader)
    print(f'Epoch {epoch + 1}: train_loss={avg_train_loss:.4f}, eval_loss={avg_eval_loss:.4f}')

# Test model
test_sentences = ['Today is a good day', 'I like Chinese food', 'Paris is a beautiful city']
for sentence in test_sentences:
    tags = predict(model, sentence)
    print(tags)

Here, we train the model with the Adam optimizer, using the negative log-likelihood of the CRF layer as the loss (rather than a plain cross-entropy). We then use the held-out set to evaluate the model's performance and use the model to predict entities in some new sentences.
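
If you want an accuracy figure in addition to the loss, a minimal sketch (assuming a test_dataloader built the same way as train_dataloader, which is not shown above) can compare the CRF-decoded tags with the gold labels token by token:

def token_accuracy(model, dataloader):
    # Token-level accuracy; entity-level metrics such as F1 would need extra bookkeeping
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            pred_tag_lists = model(input_ids, attention_mask)  # list of decoded tag-id lists
            for preds, gold, mask in zip(pred_tag_lists, labels, attention_mask):
                length = int(mask.sum().item())
                gold_ids = gold[:length].tolist()
                correct += sum(p == g for p, g in zip(preds, gold_ids))
                total += length
    return correct / max(total, 1)

# print(f'test token accuracy: {token_accuracy(model, test_dataloader):.4f}')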