Implement trie structure for tokenization
joshcarp committed Apr 23, 2024
1 parent a78a79a commit 78a4484
Showing 11 changed files with 379 additions and 166 deletions.
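The trie itself lives in the two changed files not rendered below (presumably the tokenizer and its tests), so as orientation only, here is a minimal sketch of trie-based longest-match encoding in Go. Everything in it — trieNode, newTrie, encode — is an illustrative assumption, not this commit's code; the diff below only shows that a newTokenizer(vocab []string) constructor exists.

// Sketch only: a byte-level trie mapping vocab entries to token ids,
// with greedy longest-match encoding. Names are assumptions.
type trieNode struct {
	children map[byte]*trieNode
	id       int32 // token id ending at this node, -1 if none
}

func newTrie(vocab []string) *trieNode {
	root := &trieNode{children: map[byte]*trieNode{}, id: -1}
	for i, word := range vocab {
		node := root
		for j := 0; j < len(word); j++ {
			next, ok := node.children[word[j]]
			if !ok {
				next = &trieNode{children: map[byte]*trieNode{}, id: -1}
				node.children[word[j]] = next
			}
			node = next
		}
		node.id = int32(i)
	}
	return root
}

// encode greedily matches the longest vocab entry at each position,
// skipping bytes that start no vocab entry.
func (root *trieNode) encode(s string) []int32 {
	var out []int32
	for i := 0; i < len(s); {
		node, bestID, bestLen := root, int32(-1), 0
		for j := i; j < len(s); j++ {
			next, ok := node.children[s[j]]
			if !ok {
				break
			}
			node = next
			if node.id >= 0 {
				bestID, bestLen = node.id, j-i+1
			}
		}
		if bestID < 0 {
			i++
			continue
		}
		out = append(out, bestID)
		i += bestLen
	}
	return out
}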
2 changes: 1 addition & 1 deletion cmd/testgpt2/main.go
@@ -54,7 +54,7 @@ func main() {
fmt.Printf("[State]\n")
fmt.Printf("batch_size: %d\n", B)
fmt.Printf("seq_len: %d\n", T)
fmt.Printf("num_activations: %d\n", model.NumActivations)
fmt.Printf("num_activations: %d\n", len(model.Acts.Memory))
allok := true
var losses []float32
for step := 0; step < 10; step++ {
15 changes: 15 additions & 0 deletions dataloader.go
@@ -50,6 +50,21 @@ func newDataLoader(file io.Reader, batchSize, seqLength int) (*DataLoader, error) {
return loader, nil
}

func newDataLoaderFromInts(data []int32, batchSize, seqLength int) (*DataLoader, error) {
size := len(data)
if size < (batchSize*seqLength + 1) {
return nil, errors.New("error: file size is too small for the batch size and sequence length")
}
loader := &DataLoader{
batchSize: batchSize,
seqLength: seqLength,
NumBatches: size / (batchSize * seqLength),
data: data,
fileSize: int64(size),
}
return loader, nil
}

func (loader *DataLoader) Reset() {
loader.currentPosition = 0
}
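A usage sketch of the new in-memory constructor: NextBatch's (inputs, targets, error) shape is taken from the TestSmallGPT code removed later in this commit, and the token values here are arbitrary. The size check exists because each batch consumes batchSize*seqLength inputs plus one extra token for the shifted targets.

func exampleLoaderFromInts() {
	// len(tokens) must be at least batchSize*seqLength+1; here 9 == 2*4+1.
	tokens := []int32{10, 11, 12, 13, 14, 15, 16, 17, 18}
	loader, err := newDataLoaderFromInts(tokens, 2, 4) // B=2, T=4, NumBatches=1
	if err != nil {
		panic(err)
	}
	inp, tar, err := loader.NextBatch() // targets are inputs shifted by one token
	_, _, _ = inp, tar, err
}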
6 changes: 4 additions & 2 deletions go.mod
@@ -5,9 +5,11 @@ go 1.22.1
require github.com/stretchr/testify v1.9.0

require (
github.com/brianvoe/gofakeit v3.18.0+incompatible // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/kr/pretty v0.2.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
8 changes: 8 additions & 0 deletions go.sum
@@ -1,16 +1,24 @@
github.com/brianvoe/gofakeit v3.18.0+incompatible h1:wDOmHc9DLG4nRjUVVaxA+CEglKOW72Y5+4WNxUIkjM8=
github.com/brianvoe/gofakeit v3.18.0+incompatible/go.mod h1:kfwdRA90vvNhPutZWfH7WPaDzUjz+CZFqG+rPkOjGOc=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4 h1:GpfJ7OdNjS7BFTVwNCUI9L4aCJOFRbr5fdHqjdhoYE8=
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4/go.mod h1:f3jBhpWvuZmue0HZK52GzRHJOYHYSILs/c8+K2S/J+o=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
73 changes: 44 additions & 29 deletions gpt.go
@@ -9,14 +9,17 @@ import (
"time"
)

const GPT2_EOT = 50256
const (
GPT2_EOT int32 = 50256
)

type GPT2Config struct {
MaxSeqLen int `json:"max_seq_len"`
V int `json:"vocab_size"`
L int `json:"num_layers"`
NH int `json:"num_heads"`
C int `json:"channels"`
EOT int32
}

type GPT2 struct {
@@ -25,20 +28,19 @@ type GPT2 struct {
// Params has the actual weights of the model. Params.Memory is for convenience to be able to set/reset parameters simply
Params ParameterTensors // Weights of the model
// Grads contains the delta/gradient that will eventually be applied to the params in the model
Grads ParameterTensors // Gradients of the weights
NumParameters int // Total number of parameters
Grads ParameterTensors // Gradients of the weights
// Fields for AdamW optimizer
MMemory []float32 // First moment estimates (for AdamW)
VMemory []float32 // Second moment estimates (for AdamW)
Acts ActivationTensors // Activations of the model
// gradients of the activations
GradsActs ActivationTensors
NumActivations int
B int // Current batch size (B)
T int // Current sequence length (T)
Inputs []int32 // Input tokens
Targets []int32 // Target tokens
MeanLoss float32 // Mean loss after a forward pass
GradsActs ActivationTensors
B int // Current batch size (B)
T int // Current sequence length (T)
Inputs []int32 // Input tokens
Targets []int32 // Target tokens
MeanLoss float32 // Mean loss after a forward pass
Rand *rand.Rand
}

// LoadGPT2Model loads the GPT-2 model from a checkpoint file.
@@ -65,6 +67,22 @@ func LoadGPT2Model(checkpointPath, tokenizerFile string) (*GPT2, error) {
return model, nil
}

func newGPT2(MaxSeqLen, V, L, NH, C int, vocab []string) GPT2 {
model := GPT2{
Config: GPT2Config{
MaxSeqLen: MaxSeqLen,
V: V,
L: L,
NH: NH,
C: C,
},
Params: newParameterTensors(V, C, MaxSeqLen, L),
Tokenizer: newTokenizer(vocab),
Rand: rand.New(rand.NewSource(21)),
}
return model
}

func loadFromReader(f io.Reader) (*GPT2, error) {
header := make([]int32, 256)
err := binary.Read(f, binary.LittleEndian, header)
@@ -81,10 +99,11 @@ func loadFromReader(f io.Reader) (*GPT2, error) {
L: int(header[4]),
NH: int(header[5]),
C: int(header[6]),
EOT: GPT2_EOT,
},
Rand: rand.New(rand.NewSource(21)),
}
model.Params.Init(model.Config.V, model.Config.C, model.Config.MaxSeqLen, model.Config.L)
model.NumParameters = len(model.Params.Memory)
if err := binary.Read(f, binary.LittleEndian, model.Params.Memory); err != nil {
return nil, fmt.Errorf("error reading model: %v", err)
}
@@ -99,7 +118,7 @@ func (model *GPT2) String() string {
s += fmt.Sprintf("num_layers: %d\n", model.Config.L)
s += fmt.Sprintf("num_heads: %d\n", model.Config.NH)
s += fmt.Sprintf("channels: %d\n", model.Config.C)
s += fmt.Sprintf("num_parameters: %d\n", model.NumParameters)
s += fmt.Sprintf("num_parameters: %d\n", len(model.Params.Memory))
return s
}

@@ -264,7 +283,6 @@ func (model *GPT2) Backward() error {
if len(model.Grads.Memory) == 0 {
model.Grads.Init(V, C, model.Config.MaxSeqLen, L)
model.GradsActs.Init(B, C, T, L, NH, V)
model.NumActivations = len(model.GradsActs.Memory)
model.ZeroGradient()
}
// backward pass
@@ -353,11 +371,11 @@ func (model *GPT2) Backward() error {
func (model *GPT2) Update(learningRate, beta1, beta2, eps, weightDecay float32, t int) {
// Lazy memory allocation
if model.MMemory == nil {
model.MMemory = make([]float32, model.NumParameters)
model.VMemory = make([]float32, model.NumParameters)
model.MMemory = make([]float32, model.Params.Len())
model.VMemory = make([]float32, model.Params.Len())
}
// Parameter updates
for i := 0; i < model.NumParameters; i++ {
for i := 0; i < model.Params.Len(); i++ {
parameter := model.Params.Memory[i]
gradient := model.Grads.Memory[i]
// Momentum update
@@ -374,8 +392,8 @@
}
}
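The collapsed hunk above hides the moment arithmetic, so for readers following along, this is the textbook bias-corrected AdamW step the allocation supports — a sketch consistent with Update's signature and loop variables (parameter, gradient, i, t), not necessarily the repo's exact lines; it assumes the math package is imported.

// inside the parameter loop: parameter, gradient, i, t as above
m := beta1*model.MMemory[i] + (1-beta1)*gradient
v := beta2*model.VMemory[i] + (1-beta2)*gradient*gradient
mHat := m / (1 - float32(math.Pow(float64(beta1), float64(t))))
vHat := v / (1 - float32(math.Pow(float64(beta2), float64(t))))
model.MMemory[i], model.VMemory[i] = m, v
model.Params.Memory[i] = parameter - learningRate*(mHat/(float32(math.Sqrt(float64(vHat)))+eps)+weightDecay*parameter)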

func (model *GPT2) Inference(input string) (string, error) {
B, T := 1, 16
func (model *GPT2) Inference(input string, B, T int) (string, error) {
//B, T := 1, 16
start := time.Now()
defer func() {
fmt.Printf("inference time took: %v\n", time.Now().Sub(start))
@@ -386,25 +404,23 @@ }
}
if len(tokens) < T {
for i := len(tokens); i <= T; i++ {
tokens = append(tokens, GPT2_EOT)
tokens = append(tokens, model.Config.EOT)
}
}
fmt.Printf("input is %d tokens long\n", len(tokens))
model.Forward(tokens, tokens[1:], B, T)
genTokens := make([]int32, B*T)
const genMaxLength = 16
genTokens[0] = GPT2_EOT // the GPT-2 EOT token kicks off the generation
genTokens := make([]int32, model.Config.MaxSeqLen)
for i := 0; i < B*T; i++ {
genTokens[i] = GPT2_EOT
genTokens[i] = model.Config.EOT
}
for t := 1; t < genMaxLength; t++ {
for t := 1; t < model.Config.MaxSeqLen; t++ {
fmt.Printf("generating token: %d\n", t)
// for each t, we re-compute all activations between 0 and t
// leaving this alone because you want separate code for inference anyway
// the inference here is just for sanity checking purposes
model.Forward(genTokens, nil, B, t)
probabilities := model.Acts.Probabilities.data[(t-1)*model.Config.V:]
coin := rand.Float32()
coin := model.Rand.Float32()
nextToken2 := sampleMult(probabilities, coin)
genTokens[t] = rune(nextToken2)
}
@@ -433,12 +449,11 @@ func (model *GPT2) Train(valDataloader, trainDataloader *DataLoader, B, T int) error {
valLoss /= float32(valNumBatches)
fmt.Printf("val loss %f\n", valLoss)
}
if true || step > 0 && step%20 == 0 {
if step > 0 && step%20 == 0 {
for i := 0; i < B*T; i++ {
genTokens[i] = GPT2_EOT
genTokens[i] = model.Config.EOT
}
genTokens[0] = GPT2_EOT // the GPT-2 EOT token kicks off the generation
for t := 1; t < genMaxLength; t++ {
for t := 1; t < len(genTokens); t++ {
// for each t, we re-compute all activations between 0 and t
// leaving this alone because you want separate code for inference anyway
// the inference here is just for sanity checking purposes
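Inference picks the next token through sampleMult, which this commit doesn't touch and the page doesn't show; a conventional CDF-walk implementation consistent with the call site — a probability slice plus a coin in [0,1) — would look like the sketch below. It is an assumption, not the repo's code.

// Walk the cumulative distribution until it crosses the coin flip.
func sampleMult(probabilities []float32, coin float32) int {
	var cdf float32
	for i, p := range probabilities {
		cdf += p
		if coin < cdf {
			return i
		}
	}
	return len(probabilities) - 1 // guard against float rounding shortfall
}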
48 changes: 48 additions & 0 deletions gpt_test.go
@@ -0,0 +1,48 @@
package llmgo

import (
"github.com/stretchr/testify/assert"
"testing"
)

func TestLoadGPT2Model(t *testing.T) {
tests := []struct {
name string
maxSeqLen int
v int
l int
nh int
c int
vocab []string
input string
output string
}{
{
name: "",
maxSeqLen: 3,
v: 3,
l: 2,
nh: 1,
c: 1,
vocab: []string{"a", "b", "c"},
input: "abcd",
output: "acc",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
B, T := 1, 2
model := newGPT2(tt.maxSeqLen, tt.v, tt.l, tt.nh, tt.c, tt.vocab)
tokens, err := model.Tokenizer.Encode(tt.input)
validation, err := newDataLoaderFromInts(tokens, B, T)
assert.NoError(t, err)
train, err := newDataLoaderFromInts(tokens, B, T)
assert.NoError(t, err)
err = model.Train(validation, train, B, T)
assert.NoError(t, err)
output, err := model.Inference(tt.input, 1, 2)
assert.NoError(t, err)
println(output)
})
}
}
34 changes: 1 addition & 33 deletions main_test.go
@@ -4,37 +4,5 @@ import (
"testing"
)

func TestSmallGPT(t *testing.T) {
B := 4
T := 64
model := GPT2{
Config: GPT2Config{
MaxSeqLen: 64,
V: 50257,
L: 2,
NH: 4,
C: 200,
},
}
model.Params.Init(model.Config.V, model.Config.C, model.Config.MaxSeqLen, model.Config.L)
model.NumParameters = len(model.Params.Memory)
var s float32
for i := range model.Params.Memory {
model.Params.Memory[i] = 0.001
s += model.Params.Memory[i]
}
dataloader, err := NewDataLoader("./data/tiny_shakespeare_val.bin", B, T)
if err != nil {
panic(err)
}
for i := 0; i < 10; i++ {
inp, tar, err := dataloader.NextBatch()
if err != nil {
panic(err)
}
model.Forward(inp, tar, B, T)
model.ZeroGradient()
model.Backward()
model.Update(1e-4, 0.9, 0.999, 1e-8, 0.0, i+1)
}
func TestGPT(t *testing.T) {
}
28 changes: 1 addition & 27 deletions math_test.go
@@ -291,32 +291,6 @@ func TestAttentionForward(t *testing.T) {
}
}

func TestAttentionBackward(t *testing.T) {
type args struct {
dinp []float32
dpreatt []float32
datt []float32
dout []float32
inp []float32
att []float32
B int
T int
C int
NH int
}
tests := []struct {
name string
args args
}{
// TODO: Add test cases.
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
attentionBackward(tt.args.dinp, tt.args.dpreatt, tt.args.datt, tt.args.dout, tt.args.inp, tt.args.att, tt.args.B, tt.args.T, tt.args.C, tt.args.NH)
})
}
}

func FuzzGeluInverse(f *testing.F) {
for i := 0; i < 1000; i++ {
// Generate random input values
@@ -354,7 +328,7 @@ func TestInference(t *testing.T) {
randomText := "Kathleen Mary Ferrier CBE (22 April 1912 – 8 October 1953)[1] was an English contralto singer who achieved an international reputation as a stage, concert and recording artist, with a repertoire extending from folksong and popular ballads to the classical works of Bach, Brahms, Mahler and Elgar. Her death from cancer, at the height of her fame, was a shock to the musical world and particularly to the general public, which was kept in ignorance of the nature of her illness until after her death. The daughter of a Lancashire village schoolmaster, Ferrier showed early talent as a pianist, and won numerous amateur piano competitions while working as a telephonist with the General Post Office. She did not take up singing seriously until 1937, when after winning a prestigious singing competition at the Carlisle Festival she began to receive offers of professional engagements as a vocalist. Thereafter she took singing lessons, first with J. E. Hutchinson and later with Roy Henderson. After the outbreak of the Second World War Ferrier was recruited by the Council for the Encouragement of Music and the Arts (CEMA), and in the following years sang at concerts and recitals throughout the UK. In 1942 her career was boosted when she met the conductor Malcolm Sargent, who recommended her to the influential Ibbs and Tillett concert management agency. She became a regular performer at leading London and provincial venues, and made numerous BBC radio broadcasts. In 1946 Ferrier made her stage debut in the Glyndebourne Festival premiere of Benjamin Britten's opera The Rape of Lucretia. A year later she made her first appearance as Orfeo in Gluck's Orfeo ed Euridice, a work with which she became particularly associated. By her own choice, these were her only two operatic roles. As her reputation grew, Ferrier formed close working relationships with major musical figures, including Britten, Sir John Barbirolli, Bruno Walter and the accompanist Gerald Moore. She became known internationally through her three tours to the United States between 1948 and 1950 and her many visits to continental Europe. Ferrier was diagnosed with breast cancer in March 1951. In between periods of hospitalisation and convalescence she continued to perform and record; her final public appearance was as Orfeo, at the Royal Opera House in February 1953, eight months before her death. Among her many memorials, the Kathleen Ferrier Cancer Research Fund was launched in May 1954. The Kathleen Ferrier Scholarship Fund, administered by the Royal Philharmonic Society, has since 1956 made annual awards to aspiring young professional singers."
model, err := LoadGPT2Model("./gpt2_124M.bin", "./gpt2_tokenizer.bin")
require.NoError(t, err)
output, err := model.Inference(randomText)
output, err := model.Inference(randomText, 1, 1)
require.NoError(t, err)
t.Log(output)
}
10 changes: 10 additions & 0 deletions tensor.go
@@ -90,6 +90,16 @@ type ParameterTensors struct {
LayerFinNormB tensor // (C) - Final layer normalization biases
}

func newParameterTensors(V, C, maxSeqLen, L int) ParameterTensors {
var tensor ParameterTensors
tensor.Init(V, C, maxSeqLen, L)
return tensor
}

func (tensor *ParameterTensors) Len() int {
return len(tensor.Memory)
}

// Init initialises the ParameterTensors with specific sizes for each tensor based on the model architecture.
func (tensor *ParameterTensors) Init(V, C, maxSeqLen, L int) {
tensor.Memory = make([]float32,
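Params.Len() is what replaces the deleted NumParameters field across gpt.go. A quick sketch of the new constructor pair, using GPT-2 small's published dimensions (V=50257, C=768, maxSeqLen=1024, L=12); assumes fmt is imported:

params := newParameterTensors(50257, 768, 1024, 12)
fmt.Printf("num_parameters: %d\n", params.Len()) // same count gpt.go now reports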