A NumPy implementation of the training process for a decoder-only, character-level GPT

The first figure shows the complete pipeline, the second shows the decoder, and the third shows the encoder.

This first part covers character prediction; the model for article prediction is still being trained.

The corresponding repository is:

numpy_transformer/gpt_character at master · ZouJiu1/numpy_transformer

import os
# Work out the base directory used for datasets, logs and checkpoints.
# `filename` is the last two components of this script's absolute path
# ("<containing dir><sep><script name>"); removing that text from the path
# leaves `abspath`, which for an ordinary path is the parent of the script's
# directory and still ends with a path separator.
_this_file = os.path.abspath(__file__)
filename = os.sep.join(_this_file.rsplit(os.sep, 2)[-2:])
abspath = _this_file.replace(filename, "")

from net.loss import cross_entropy_loss
import numpy as np
import pickle
from net.layernorm import layer_norm
from PatchEmbed import Position_Embedding
from gpt.attdecoderblock import attdecoderblock_layer
from net.layernorm import layer_norm
from net.fullconnect import fclaer

from copy import deepcopy
import json


def getdata():
    """Build the toy alphabet corpus and its character/id vocabularies.

    The corpus is 13 rows, each holding the lowercase alphabet four times
    separated by single spaces.  Row breaks are mapped to spaces, so the
    vocabulary is the 26 letters plus the space character.  The vocabulary is
    cached in ``<abspath>/dataset/alphabet.json``: it is written when the file
    does not exist yet and read back from the file otherwise.

    Returns:
        tuple: ``(length, id2char, char2id, kk)`` where ``length`` is the
        vocabulary size, ``id2char`` maps integer ids to characters,
        ``char2id`` maps characters to integer ids and ``kk`` is the corpus
        as a list of single characters.
    """
    dataset = os.path.join(abspath, 'dataset')
    os.makedirs(dataset, exist_ok=True)
    id2char_char2id = os.path.join(dataset, "alphabet.json")

    # NOTE(review): the corpus literal had lost its quotes in this copy of the
    # file.  It is rebuilt here from its visible content (13 rows x 4
    # alphabets), assuming there was no leading or trailing blank line.
    row = " ".join(["abcdefghijklmnopqrstuvwxyz"] * 4)
    readcontent = "\n".join([row] * 13)

    # Treat a line break like a word gap so it does not become its own token.
    kk = [" " if ch == "\n" else ch for ch in readcontent]

    # Sorted plain-`str` vocabulary: the same ordering np.unique would give,
    # and the same value types that the JSON branch below returns.
    unique = sorted(set(kk))
    id2char = dict(enumerate(unique))
    char2id = {char: i for i, char in enumerate(unique)}

    if not os.path.exists(id2char_char2id):
        with open(id2char_char2id, 'w', encoding='utf-8') as obj:
            json.dump({"id2char":id2char, 'char2id':char2id}, obj, indent=2, separators=(",", ":"), ensure_ascii=False)
    else:
        # NOTE(review): the `else:` was missing from this copy; it is restored
        # so that an existing cache file is what gets read back.
        with open(id2char_char2id, 'r', encoding='utf-8') as obj:
            jsonfile = json.load(obj)
        char2id = jsonfile["char2id"]
        # JSON object keys are always strings; turn the ids back into ints.
        id2char = {int(key): value for key, value in jsonfile["id2char"].items()}

    # Report the size of the mapping that is actually returned, so the two
    # stay consistent even when a cached vocabulary was loaded.
    length = len(id2char)
    return length, id2char, char2id, kk

def create_masks(input_mask):
    """Return an additive causal (lower-triangular) mask for each batch row.

    Only the shape and dtype of ``input_mask`` influence the result; its
    values are not consulted.

    Args:
        input_mask: 2-D array-like of shape ``(n, sequence_length)``.

    Returns:
        A new array of shape ``(n, sequence_length, sequence_length)`` holding
        ``0`` on and below the diagonal and ``-inf`` above it.

    Raises:
        ValueError: if ``input_mask`` is not two-dimensional.
    """
    mask = np.array(input_mask)
    batch, seq_len = mask.shape

    # The result dtype is whatever "(1.0 - mask) * (-1e6)" promotes to
    # (float64 for integer or boolean input, the input's own dtype for
    # ordinary float input).  Probe it on an empty slice.
    out_dtype = ((1.0 - mask[:0]) * (-1e6)).dtype

    # Position i may look at positions 0..i only.
    visible = np.tril(np.ones((seq_len, seq_len), dtype=bool))
    causal = np.where(visible, 0.0, -np.inf).astype(out_dtype)

    # One independent, writable copy of the pattern per batch row.
    return np.tile(causal, (batch, 1, 1))

def _learning_rate_for_epoch(epoch_index, total_epochs, base_lr):
    """Return the step-decayed learning rate in effect at ``epoch_index``."""
    # Thresholds are compared with ">=" so that a run resumed past a decay
    # point still gets the decayed rate.
    if epoch_index >= 26 * total_epochs // 30:
        return base_lr * 0.1 * 0.1
    if epoch_index >= 20 * total_epochs // 30:
        return base_lr * 0.1
    return base_lr


def _next_batch(input_texts, char2id, start, context_length, batch_size):
    """Cut ``batch_size`` consecutive windows out of the corpus.

    Each window holds ``context_length + 1`` characters: the first
    ``context_length`` are the model input and the same window shifted by one
    is the next-character target.

    Returns:
        ``(inputs, labels, next_start)``, or ``None`` once the corpus cannot
        supply a complete batch any more.
    """
    window = context_length + 1
    inputs, labels = [], []
    for _ in range(batch_size):
        if start + window >= len(input_texts):
            return None
        ids = [char2id[ch] for ch in input_texts[start:start + window]]
        inputs.append(ids[:-1])
        labels.append(ids[1:])
        start += window
    return inputs, labels, start


def _greedy_sample(layers, vocab_size, steps):
    """Generate ``steps`` ids greedily, starting from one random id.

    As in the original loop, only the most recent id is fed back in at each
    step and every layer is called without an attention mask.
    """
    token = np.random.randint(0, vocab_size, (1, 1))
    generated = token.copy()
    for _ in range(steps):
        hidden = token
        for layer in layers:
            hidden = layer.forward(hidden)
        logits = np.reshape(hidden, (-1, vocab_size))
        # Softmax is monotonic, so the arg-max of the logits is the greedy pick.
        token = np.expand_dims(np.argmax(logits, axis=-1), -1)
        generated = np.concatenate([generated, token], axis=-1)
    return generated[0]


def transformer_image_train(num_classes):
    """Train the single-block character GPT on the alphabet corpus.

    Builds ``Position_Embedding -> attdecoderblock_layer -> layer_norm ->
    fclayer``, optionally restores a pickled checkpoint, then for every epoch
    walks the corpus in non-overlapping windows, logs loss/precision per
    batch, logs one greedy sample per epoch and finally pickles the layer
    states into ``<abspath>/gpt_character/model``.

    Reads the module-level names ``logdir``, ``pretrained_model`` and
    ``batchsize``.

    NOTE(review): this copy of the function was damaged (``+ =`` operators,
    ``if`` statements without bodies, missing ``else``/``try``/``except``).
    Lines restored from context are marked below.  Two things are NOT visible
    anywhere in this file and were deliberately not invented:
      * a definition of ``batchsize`` - presumably a module-level global like
        ``pretrained_model``; confirm;
      * the optimiser step after ``backward`` - see the note in the loop.

    Args:
        num_classes: unused; kept so existing callers keep working.
    """
    vocab_size, id2char, char2id, input_texts = getdata()

    epoch = 10000
    learning_rate = 0.0002
    embed_dim = 27
    num_layer = 1
    num_h = [3] * num_layer
    context_length = 10

    logfile = os.path.join(logdir, 'log_gpt_alphabet.txt')

    layers = [Position_Embedding(context_length, vocab_size, embed_dim)]
    layers += [attdecoderblock_layer(embed_dim, heads) for heads in num_h]
    # NOTE(review): the file-level import spells this name `fclaer`; one of
    # the two spellings is a typo - confirm which.
    layers += [layer_norm(embed_dim), fclayer(embed_dim, vocab_size, True)]

    datapath = os.path.join(abspath, 'dataset')
    os.makedirs(datapath, exist_ok=True)
    modelpath = os.path.join(abspath, 'gpt_character', 'model')
    os.makedirs(modelpath, exist_ok=True)

    # Layers that can be checkpointed expose both of these methods.
    savable = [l for l in layers
               if hasattr(l, 'restore_model') and hasattr(l, 'save_model')]

    start_epoch = 1
    if os.path.exists(pretrained_model):
        # SECURITY: pickle.load can execute arbitrary code - only point
        # `pretrained_model` at checkpoints you created yourself.
        with open(pretrained_model, 'rb') as obj:
            models = pickle.load(obj)
        # NOTE(review): the restore call itself was missing from this copy;
        # assumes restore_model(state) takes the matching saved entry.
        for layer, state in zip(savable, models):
            layer.restore_model(state)
        del models
        # Checkpoints are named "gpt_english_epoch_<N>_loss_<L>.pkl".
        try:
            start_epoch = int(pretrained_model.split(os.sep)[-1].split("_")[3]) + 1
        except (IndexError, ValueError):
            start_epoch = 1

    alliter = 0
    meanloss = 0.0
    last_epoch = start_epoch - 1  # keeps the checkpoint name valid if no epoch runs
    with open(logfile, 'w', encoding='utf-8') as fpwrite:
        for i in range(start_epoch, epoch + 1):
            last_epoch = i
            lr = _learning_rate_for_epoch(i, epoch, learning_rate)
            meanloss = 0.0
            number = 0
            jk = 0
            while True:
                batch = _next_batch(input_texts, char2id, number,
                                    context_length, batchsize)
                if batch is None:
                    break
                inputs, label, number = batch
                jk += 1
                alliter += 1

                inputs = np.array(inputs)
                input_mask = create_masks(np.ones_like(inputs))
                label_single = np.array(label).reshape(-1)

                for layer in layers:
                    if isinstance(layer, attdecoderblock_layer):
                        inputs = layer.forward(inputs, input_mask)
                    else:
                        inputs = layer.forward(inputs)

                ishape = inputs.shape
                inputs = np.reshape(inputs, (-1, vocab_size))
                labels = np.zeros_like(inputs)  # one-hot targets
                labels[np.arange(len(inputs)), label_single] = 1
                loss, delta, predict = cross_entropy_loss(inputs, labels)
                delta = np.reshape(delta, ishape)
                meanloss += loss
                p = np.argmax(predict, axis=-1)
                precision = np.sum(label_single == p) / len(label_single)

                fpwrite.write("epoch:{}, lr: {:.6f}, loss: {:.6f}, iters: {}, precision: {:.6f}\n".format(
                    i, lr, loss, str(jk) + "_" + str(alliter), precision))

                for layer in reversed(layers):
                    delta = layer.backward(delta)
                # NOTE(review): no parameter update is visible in this copy.
                # Unless backward() applies the step itself, the per-layer
                # update call that presumably followed (using `lr`) must be
                # restored from upstream before this loop can learn.

            meanloss /= max(jk, 1)

            sample = _greedy_sample(layers, vocab_size, context_length - 1)
            output = [id2char[int(ij)] for ij in sample]
            fpwrite.write("epoch: {}, {}\n".format(i, ''.join(output[:200])))
            fpwrite.flush()

    # savemodel
    # NOTE(review): the line collecting the states was missing from this
    # copy; assumes save_model() returns the picklable state of a layer.
    allmodel = [layer.save_model() for layer in savable]
    name = "gpt_english_epoch_" + str(last_epoch) + "_loss_" + str(round(meanloss, 6)) + ".pkl"

    with open(os.path.join(modelpath, name), 'wb') as obj:
        pickle.dump(allmodel, obj)

if __name__ == "__main__":
    # Script-mode configuration.  These become module-level globals;
    # transformer_image_train() reads `pretrained_model` and `logdir` directly.
    savepath = abspath
    # Checkpoint to resume from.  With the empty string, os.path.exists() is
    # false, so the checkpoint-loading branch is skipped.
    pretrained_model = r''
    # Directory that receives the training log; created before training opens the file.
    logdir = os.path.join(savepath, 'gpt', 'log')
    os.makedirs(logdir, exist_ok=True)
    # NOTE(review): no call to transformer_image_train() and no definition of
    # the `batchsize` it reads are visible after this point - presumably lost
    # from this copy of the file; confirm against upstream.

