# main.py


import matplotlib.pylab as plt
import torch
from torch.nn import functional as F


# Preferred accelerator name.  NOTE: every tensor created in this section is
# allocated on the default (CPU) device and is not moved to `device`.
device = 'mps'

# Load the training corpus, lower-cased to keep the vocabulary small.
with open("data.txt", "r", encoding='utf-8') as f:
    text = f.read().lower()

# Character-level vocabulary plus lookup tables in both directions.
chars = sorted(set(text))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))
vocab_size = len(chars)

# Hyper-parameters.
ins = 64            # context length: tokens per training window
outs = vocab_size   # width of the output logits
nodes = 200         # hidden width of each block's feed-forward layer
lr = 0.003          # optimizer learning rate
n_emb = 64          # embedding width

# Token and position embedding tables.  They are created as trainable leaves
# and registered in `params` below; previously they were plain tensors, so the
# optimizer never updated them and they stayed at their random initial values.
embed = torch.randn(vocab_size, n_emb, requires_grad=True)
pos = torch.randn(ins, n_emb, requires_grad=True)

# The whole corpus encoded as a 1-D tensor of character ids.
data = torch.tensor([stoi[c] for c in text], dtype=torch.long)

# Every trainable tensor in the script; weights() appends to this list and the
# optimizer is built from it.
params = [embed, pos]


def weights(ins, outs):
    """Create a trainable ``(ins, outs)`` weight matrix and register it.

    Entries are drawn from a standard normal distribution and scaled by 0.1.
    The tensor is appended to the module-level ``params`` list so that the
    optimizer built from that list updates it.

    The tensor stays on the default device.  An earlier ``ws.to(device)`` call
    here had no effect: ``Tensor.to`` returns a new tensor instead of moving
    the original in place, and the result was discarded.  It only cost an
    extra copy and raised on machines where that device is unavailable, so it
    has been removed.

    Args:
        ins: number of rows (input features).
        outs: number of columns (output features).

    Returns:
        The newly created leaf tensor with ``requires_grad=True``.
    """
    ws = (torch.randn(ins, outs) * 0.1).requires_grad_(True)
    params.append(ws)
    return ws


class Head():
    """Single causal self-attention head.

    The input is projected to value/query/key spaces of width ``n_emb // 4``.
    For every position the output is a softmax-weighted mix of the values at
    that position and at earlier positions only.
    """

    def __init__(self):
        self.wv = weights(n_emb, n_emb//4)
        self.wq = weights(n_emb, n_emb//4)
        self.wk = weights(n_emb, n_emb//4)
        # Not read by forward(); kept so the attribute and the set of
        # registered parameters stay the same as before.
        self.wr = weights(n_emb, ins)

    def forward(self, x):
        """Apply causal attention to ``x``.

        Args:
            x: tensor whose last two dimensions are (sequence, n_emb); any
                leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same leading dimensions and sequence length as
            ``x`` and a last dimension equal to the head width.
        """
        v = x @ self.wv
        q = x @ self.wq
        k = x @ self.wk

        # Scale by sqrt(head width), i.e. the LAST dimension of k.  Using the
        # leading dimension made the scale depend on the batch size during
        # training and on the sequence length for unbatched inputs.
        scores = (q @ k.transpose(-2, -1)) / k.shape[-1] ** 0.5

        # Explicit lower-triangular mask.  Deriving the mask from
        # `tril(scores) == 0` also hid legitimate zero scores at allowed
        # positions and could let a position attend to the future.
        seq_len = scores.shape[-1]
        causal = torch.ones(seq_len, seq_len, device=scores.device).tril().bool()
        scores = scores.masked_fill(~causal, float("-inf"))

        # The diagonal is always allowed, so no row is entirely -inf.
        return F.softmax(scores, dim=-1) @ v


class Block():
    """Four attention heads followed by a two-layer ReLU feed-forward stage."""

    def __init__(self):
        # Heads are created first, then the two feed-forward matrices.
        self.heads = [Head() for _ in range(4)]
        self.w0 = weights(n_emb, nodes)
        self.w1 = weights(nodes, n_emb)

    def forward(self, x):
        """Run every head on ``x``, join their outputs, apply the feed-forward stage.

        Head outputs are concatenated along the last dimension; the result is
        multiplied by ``w0`` and then ``w1``, with a ReLU after each product.
        """
        head_outputs = []
        for head in self.heads:
            head_outputs.append(head.forward(x))
        merged = torch.cat(head_outputs, dim=-1)

        hidden = (merged @ self.w0).relu()
        return (hidden @ self.w1).relu()


class Model():
    """Character-level model: embeddings, residual blocks, vocabulary logits."""

    def __init__(self):
        self.blocks = [Block(), Block(), Block(), Block()]
        self.w2 = weights(n_emb, outs)

    def forward(self, x):
        """Map token ids to next-token logits.

        Args:
            x: integer tensor of token ids whose last dimension is the
                sequence; its length must not exceed the number of rows in
                the position table.

        Returns:
            Tensor of logits with one row of ``outs`` scores per position.
        """
        # Slice the position table to the actual sequence length so shorter
        # contexts work too; for full-length windows this is the whole table.
        x = embed[x] + pos[:x.shape[-1]]

        # Apply every block with a residual connection.  The previous
        # hand-unrolled version stopped after three blocks, leaving the
        # fourth block's parameters registered but never used.
        for block in self.blocks:
            x = x + block.forward(x)

        return x @ self.w2


# Build the network and an Adam optimizer over every registered parameter.
model = Model()
optimizer = torch.optim.Adam(params, lr)
ers = []  # training loss of every step, plotted below

for step in range(5000):
    # Draw 100 random window start offsets; each target window is the input
    # window shifted one character to the right.
    starts = torch.randint(len(data) - ins, (100,))
    xs = torch.stack([data[start:start + ins] for start in starts])
    ys = torch.stack([data[start + 1:start + ins + 1] for start in starts])

    yh = model.forward(xs)

    # Flatten batch and sequence so every position is one classification.
    logits = yh.view(-1, vocab_size)
    targets = ys.long().view(-1)
    loss = F.cross_entropy(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    e = loss.item()
    ers.append(e)
    if step % 50 == 0:
        print(step, "Loss", e)

# Figure 1: loss curve over all training steps.
plt.figure(1)
plt.plot(ers)

# Figure 2: targets of the last batch overlaid with the arg-max predictions.
plt.figure(2)
plt.plot(ys)

yh = yh.argmax(dim=-1)
plt.plot(yh.detach())


# Sample 3000 characters autoregressively, seeded with the first input window
# of the last training batch.
s = xs[0]
temperature = 0.8

gen_chars = []
# Sampling needs no gradients; skipping graph construction saves time and
# memory on every forward pass.
with torch.no_grad():
    for _ in range(3000):
        yh = model.forward(s)
        # Temperature divides the logits, so values below 1 sharpen the
        # distribution.  Multiplying, as before, inverted the meaning.
        prob = F.softmax(yh[-1, :] / temperature, dim=0)
        pred = torch.multinomial(prob, num_samples=1).item()

        # Slide the context one step left and put the new token at the end.
        # torch.roll returns a copy, so the training batch is not modified.
        s = torch.roll(s, -1)
        s[-1] = pred

        gen_chars.append(itos[pred])

gen_text = "".join(gen_chars)

print(gen_text)
plt.show()