Implement trie structure for tokenization
joshcarp committed Apr 23, 2024
1 parent a78a79a commit 78a4484
Showing 11 changed files with 379 additions and 166 deletions.
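The trie itself lives in the two changed files not rendered below (presumably the tokenizer and its tests), so as orientation only, here is a minimal sketch of trie-based longest-match encoding in Go. Everything in it — trieNode, newTrie, encode — is an illustrative assumption, not this commit's code; the diff below only shows that a newTokenizer(vocab []string) constructor exists.

// Sketch only: a byte-level trie mapping vocab entries to token ids,
// with greedy longest-match encoding. Names are assumptions.
type trieNode struct {
	children map[byte]*trieNode
	id       int32 // token id ending at this node, -1 if none
}

func newTrie(vocab []string) *trieNode {
	root := &trieNode{children: map[byte]*trieNode{}, id: -1}
	for i, word := range vocab {
		node := root
		for j := 0; j < len(word); j++ {
			next, ok := node.children[word[j]]
			if !ok {
				next = &trieNode{children: map[byte]*trieNode{}, id: -1}
				node.children[word[j]] = next
			}
			node = next
		}
		node.id = int32(i)
	}
	return root
}

// encode greedily matches the longest vocab entry at each position,
// skipping bytes that start no vocab entry.
func (root *trieNode) encode(s string) []int32 {
	var out []int32
	for i := 0; i < len(s); {
		node, bestID, bestLen := root, int32(-1), 0
		for j := i; j < len(s); j++ {
			next, ok := node.children[s[j]]
			if !ok {
				break
			}
			node = next
			if node.id >= 0 {
				bestID, bestLen = node.id, j-i+1
			}
		}
		if bestID < 0 {
			i++
			continue
		}
		out = append(out, bestID)
		i += bestLen
	}
	return out
}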
2 changes: 1 addition & 1 deletion cmd/testgpt2/main.go
@@ -54,7 +54,7 @@ func main() {
fmt.Printf("[State]\n")
fmt.Printf("batch_size: %d\n", B)
fmt.Printf("seq_len: %d\n", T)
fmt.Printf("num_activations: %d\n", model.NumActivations)
fmt.Printf("num_activations: %d\n", len(model.Acts.Memory))
allok := true
var losses []float32
for step := 0; step < 10; step++ {
15 changes: 15 additions & 0 deletions dataloader.go
@@ -50,6 +50,21 @@ func newDataLoader(file io.Reader, batchSize, seqLength int) (*DataLoader, error) {
return loader, nil
}

func newDataLoaderFromInts(data []int32, batchSize, seqLength int) (*DataLoader, error) {
size := len(data)
if size < (batchSize*seqLength + 1) {
return nil, errors.New("error: file size is too small for the batch size and sequence length")
}
loader := &DataLoader{
batchSize: batchSize,
seqLength: seqLength,
NumBatches: size / (batchSize * seqLength),
data: data,
fileSize: int64(size),
}
return loader, nil
}

func (loader *DataLoader) Reset() {
loader.currentPosition = 0
}
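A usage sketch of the new in-memory constructor: NextBatch's (inputs, targets, error) shape is taken from the TestSmallGPT code removed later in this commit, and the token values here are arbitrary. The size check exists because each batch consumes batchSize*seqLength inputs plus one extra token for the shifted targets.

func exampleLoaderFromInts() {
	// len(tokens) must be at least batchSize*seqLength+1; here 9 == 2*4+1.
	tokens := []int32{10, 11, 12, 13, 14, 15, 16, 17, 18}
	loader, err := newDataLoaderFromInts(tokens, 2, 4) // B=2, T=4, NumBatches=1
	if err != nil {
		panic(err)
	}
	inp, tar, err := loader.NextBatch() // targets are inputs shifted by one token
	_, _, _ = inp, tar, err
}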
6 changes: 4 additions & 2 deletions go.mod
@@ -5,9 +5,11 @@ go 1.22.1
require github.com/stretchr/testify v1.9.0

require (
github.com/brianvoe/gofakeit v3.18.0+incompatible // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/kr/pretty v0.2.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
8 changes: 8 additions & 0 deletions go.sum
@@ -1,16 +1,24 @@
github.com/brianvoe/gofakeit v3.18.0+incompatible h1:wDOmHc9DLG4nRjUVVaxA+CEglKOW72Y5+4WNxUIkjM8=
github.com/brianvoe/gofakeit v3.18.0+incompatible/go.mod h1:kfwdRA90vvNhPutZWfH7WPaDzUjz+CZFqG+rPkOjGOc=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4 h1:GpfJ7OdNjS7BFTVwNCUI9L4aCJOFRbr5fdHqjdhoYE8=
github.com/trailofbits/go-fuzz-utils v0.0.0-20230413173806-58c38daa3cb4/go.mod h1:f3jBhpWvuZmue0HZK52GzRHJOYHYSILs/c8+K2S/J+o=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
73 changes: 44 additions & 29 deletions gpt.go
@@ -9,14 +9,17 @@ import (
"time"
)

const GPT2_EOT = 50256
const (
GPT2_EOT int32 = 50256
)

type GPT2Config struct {
MaxSeqLen int `json:"max_seq_len"`
V int `json:"vocab_size"`
L int `json:"num_layers"`
NH int `json:"num_heads"`
C int `json:"channels"`
EOT int32
}

type GPT2 struct {
@@ -25,20 +28,19 @@ type GPT2 struct {
// Params has the actual weights of the model. Params.Memory is for convenience to be able to set/reset parameters simply
Params ParameterTensors // Weights of the model
// Grads contains the delta/gradient that will eventually be applied to the params in the model
Grads ParameterTensors // Gradients of the weights
NumParameters int // Total number of parameters
Grads ParameterTensors // Gradients of the weights
// Fields for AdamW optimizer
MMemory []float32 // First moment estimates (for AdamW)
VMemory []float32 // Second moment estimates (for AdamW)
Acts ActivationTensors // Activations of the model
// gradients of the activations
GradsActs ActivationTensors
NumActivations int
B int // Current batch size (B)
T int // Current sequence length (T)
Inputs []int32 // Input tokens
Targets []int32 // Target tokens
MeanLoss float32 // Mean loss after a forward pass
GradsActs ActivationTensors
B int // Current batch size (B)
T int // Current sequence length (T)
Inputs []int32 // Input tokens
Targets []int32 // Target tokens
MeanLoss float32 // Mean loss after a forward pass
Rand *rand.Rand
}

// LoadGPT2Model loads the GPT-2 model from a checkpoint file.
@@ -65,6 +67,22 @@ func LoadGPT2Model(checkpointPath, tokenizerFile string) (*GPT2, error) {
return model, nil
}

func newGPT2(MaxSeqLen, V, L, NH, C int, vocab []string) GPT2 {
model := GPT2{
Config: GPT2Config{
MaxSeqLen: MaxSeqLen,
V: V,
L: L,
NH: NH,
C: C,
},
Params: newParameterTensors(V, C, MaxSeqLen, L),
Tokenizer: newTokenizer(vocab),
Rand: rand.New(rand.NewSource(21)),
}
return model
}

func loadFromReader(f io.Reader) (*GPT2, error) {
header := make([]int32, 256)
err := binary.Read(f, binary.LittleEndian, header)
@@ -81,10 +99,11 @@ func loadFromReader(f io.Reader) (*GPT2, error) {
L: int(header[4]),
NH: int(header[5]),
C: int(header[6]),
EOT: GPT2_EOT,
},
Rand: rand.New(rand.NewSource(21)),
}
model.Params.Init(model.Config.V, model.Config.C, model.Config.MaxSeqLen, model.Config.L)
model.NumParameters = len(model.Params.Memory)
if err := binary.Read(f, binary.LittleEndian, model.Params.Memory); err != nil {
return nil, fmt.Errorf("error reading model: %v", err)
}
@@ -99,7 +118,7 @@ func (model *GPT2) String() string {
s += fmt.Sprintf("num_layers: %d\n", model.Config.L)
s += fmt.Sprintf("num_heads: %d\n", model.Config.NH)
s += fmt.Sprintf("channels: %d\n", model.Config.C)
s += fmt.Sprintf("num_parameters: %d\n", model.NumParameters)
s += fmt.Sprintf("num_parameters: %d\n", len(model.Params.Memory))
return s
}

@@ -264,7 +283,6 @@ func (model *GPT2) Backward() error {
if len(model.Grads.Memory) == 0 {
model.Grads.Init(V, C, model.Config.MaxSeqLen, L)
model.GradsActs.Init(B, C, T, L, NH, V)
model.NumActivations = len(model.GradsActs.Memory)
model.ZeroGradient()
}
// backward pass
@@ -353,11 +371,11 @@ func (model *GPT2) Backward() error {
func (model *GPT2) Update(learningRate, beta1, beta2, eps, weightDecay float32, t int) {
// Lazy memory allocation
if model.MMemory == nil {
model.MMemory = make([]float32, model.NumParameters)
model.VMemory = make([]float32, model.NumParameters)
model.MMemory = make([]float32, model.Params.Len())
model.VMemory = make([]float32, model.Params.Len())
}
// Parameter updates
for i := 0; i < model.NumParameters; i++ {
for i := 0; i < model.Params.Len(); i++ {
parameter := model.Params.Memory[i]
gradient := model.Grads.Memory[i]
// Momentum update
@@ -374,8 +392,8 @@
}
}
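The collapsed hunk above hides the moment arithmetic, so for readers following along, this is the textbook bias-corrected AdamW step the allocation supports — a sketch consistent with Update's signature and loop variables (parameter, gradient, i, t), not necessarily the repo's exact lines; it assumes the math package is imported.

// inside the parameter loop: parameter, gradient, i, t as above
m := beta1*model.MMemory[i] + (1-beta1)*gradient
v := beta2*model.VMemory[i] + (1-beta2)*gradient*gradient
mHat := m / (1 - float32(math.Pow(float64(beta1), float64(t))))
vHat := v / (1 - float32(math.Pow(float64(beta2), float64(t))))
model.MMemory[i], model.VMemory[i] = m, v
model.Params.Memory[i] = parameter - learningRate*(mHat/(float32(math.Sqrt(float64(vHat)))+eps)+weightDecay*parameter)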

func (model *GPT2) Inference(input string) (string, error) {
B, T := 1, 16
func (model *GPT2) Inference(input string, B, T int) (string, error) {
//B, T := 1, 16
start := time.Now()
defer func() {
fmt.Printf("inference time took: %v\n", time.Now().Sub(start))
@@ -386,25 +404,23 @@ }
}
if len(tokens) < T {
for i := len(tokens); i <= T; i++ {
tokens = append(tokens, GPT2_EOT)
tokens = append(tokens, model.Config.EOT)
}
}
fmt.Printf("input is %d tokens long\n", len(tokens))
model.Forward(tokens, tokens[1:], B, T)
genTokens := make([]int32, B*T)
const genMaxLength = 16
genTokens[0] = GPT2_EOT // the GPT-2 EOT token kicks off the generation
genTokens := make([]int32, model.Config.MaxSeqLen)
for i := 0; i < B*T; i++ {
genTokens[i] = GPT2_EOT
genTokens[i] = model.Config.EOT
}
for t := 1; t < genMaxLength; t++ {
for t := 1; t < model.Config.MaxSeqLen; t++ {
fmt.Printf("generating token: %d\n", t)
// for each t, we re-compute all activations between 0 and t
// leaving this alone because you want separate code for inference anyway
// the inference here is just for sanity checking purposes
model.Forward(genTokens, nil, B, t)
probabilities := model.Acts.Probabilities.data[(t-1)*model.Config.V:]
coin := rand.Float32()
coin := model.Rand.Float32()
nextToken2 := sampleMult(probabilities, coin)
genTokens[t] = rune(nextToken2)
}
@@ -433,12 +449,11 @@ func (model *GPT2) Train(valDataloader, trainDataloader *DataLoader, B, T int) error {
valLoss /= float32(valNumBatches)
fmt.Printf("val loss %f\n", valLoss)
}
if true || step > 0 && step%20 == 0 {
if step > 0 && step%20 == 0 {
for i := 0; i < B*T; i++ {
genTokens[i] = GPT2_EOT
genTokens[i] = model.Config.EOT
}
genTokens[0] = GPT2_EOT // the GPT-2 EOT token kicks off the generation
for t := 1; t < genMaxLength; t++ {
for t := 1; t < len(genTokens); t++ {
// for each t, we re-compute all activations between 0 and t
// leaving this alone because you want separate code for inference anyway
// the inference here is just for sanity checking purposes
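Inference picks the next token through sampleMult, which this commit doesn't touch and the page doesn't show; a conventional CDF-walk implementation consistent with the call site — a probability slice plus a coin in [0,1) — would look like the sketch below. It is an assumption, not the repo's code.

// Walk the cumulative distribution until it crosses the coin flip.
func sampleMult(probabilities []float32, coin float32) int {
	var cdf float32
	for i, p := range probabilities {
		cdf += p
		if coin < cdf {
			return i
		}
	}
	return len(probabilities) - 1 // guard against float rounding shortfall
}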
48 changes: 48 additions & 0 deletions gpt_test.go
@@ -0,0 +1,48 @@
package llmgo

import (
"github.com/stretchr/testify/assert"
"testing"
)

func TestLoadGPT2Model(t *testing.T) {
tests := []struct {
name string
maxSeqLen int
v int
l int
nh int
c int
vocab []string
input string
output string
}{
{
name: "",
maxSeqLen: 3,
v: 3,
l: 2,
nh: 1,
c: 1,
vocab: []string{"a", "b", "c"},
input: "abcd",
output: "acc",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
B, T := 1, 2
model := newGPT2(tt.maxSeqLen, tt.v, tt.l, tt.nh, tt.c, tt.vocab)
tokens, err := model.Tokenizer.Encode(tt.input)
validation, err := newDataLoaderFromInts(tokens, B, T)
assert.NoError(t, err)
train, err := newDataLoaderFromInts(tokens, B, T)
assert.NoError(t, err)
err = model.Train(validation, train, B, T)
assert.NoError(t, err)
output, err := model.Inference(tt.input, 1, 2)
assert.NoError(t, err)
println(output)
})
}
}
34 changes: 1 addition & 33 deletions main_test.go
@@ -4,37 +4,5 @@ import (
"testing"
)

func TestSmallGPT(t *testing.T) {
B := 4
T := 64
model := GPT2{
Config: GPT2Config{
MaxSeqLen: 64,
V: 50257,
L: 2,
NH: 4,
C: 200,
},
}
model.Params.Init(model.Config.V, model.Config.C, model.Config.MaxSeqLen, model.Config.L)
model.NumParameters = len(model.Params.Memory)
var s float32
for i := range model.Params.Memory {
model.Params.Memory[i] = 0.001
s += model.Params.Memory[i]
}
dataloader, err := NewDataLoader("./data/tiny_shakespeare_val.bin", B, T)
if err != nil {
panic(err)
}
for i := 0; i < 10; i++ {
inp, tar, err := dataloader.NextBatch()
if err != nil {
panic(err)
}
model.Forward(inp, tar, B, T)
model.ZeroGradient()
model.Backward()
model.Update(1e-4, 0.9, 0.999, 1e-8, 0.0, i+1)
}
func TestGPT(t *testing.T) {
}
28 changes: 1 addition & 27 deletions math_test.go
@@ -291,32 +291,6 @@ func TestAttentionForward(t *testing.T) {
}
}

func TestAttentionBackward(t *testing.T) {
type args struct {
dinp []float32
dpreatt []float32
datt []float32
dout []float32
inp []float32
att []float32
B int
T int
C int
NH int
}
tests := []struct {
name string
args args
}{
// TODO: Add test cases.
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
attentionBackward(tt.args.dinp, tt.args.dpreatt, tt.args.datt, tt.args.dout, tt.args.inp, tt.args.att, tt.args.B, tt.args.T, tt.args.C, tt.args.NH)
})
}
}

func FuzzGeluInverse(f *testing.F) {
for i := 0; i < 1000; i++ {
// Generate random input values
@@ -354,7 +328,7 @@ func TestInference(t *testing.T) {
randomText := "Kathleen Mary Ferrier CBE (22 April 1912 – 8 October 1953)[1] was an English contralto singer who achieved an international reputation as a stage, concert and recording artist, with a repertoire extending from folksong and popular ballads to the classical works of Bach, Brahms, Mahler and Elgar. Her death from cancer, at the height of her fame, was a shock to the musical world and particularly to the general public, which was kept in ignorance of the nature of her illness until after her death. The daughter of a Lancashire village schoolmaster, Ferrier showed early talent as a pianist, and won numerous amateur piano competitions while working as a telephonist with the General Post Office. She did not take up singing seriously until 1937, when after winning a prestigious singing competition at the Carlisle Festival she began to receive offers of professional engagements as a vocalist. Thereafter she took singing lessons, first with J. E. Hutchinson and later with Roy Henderson. After the outbreak of the Second World War Ferrier was recruited by the Council for the Encouragement of Music and the Arts (CEMA), and in the following years sang at concerts and recitals throughout the UK. In 1942 her career was boosted when she met the conductor Malcolm Sargent, who recommended her to the influential Ibbs and Tillett concert management agency. She became a regular performer at leading London and provincial venues, and made numerous BBC radio broadcasts. In 1946 Ferrier made her stage debut in the Glyndebourne Festival premiere of Benjamin Britten's opera The Rape of Lucretia. A year later she made her first appearance as Orfeo in Gluck's Orfeo ed Euridice, a work with which she became particularly associated. By her own choice, these were her only two operatic roles. As her reputation grew, Ferrier formed close working relationships with major musical figures, including Britten, Sir John Barbirolli, Bruno Walter and the accompanist Gerald Moore. She became known internationally through her three tours to the United States between 1948 and 1950 and her many visits to continental Europe. Ferrier was diagnosed with breast cancer in March 1951. In between periods of hospitalisation and convalescence she continued to perform and record; her final public appearance was as Orfeo, at the Royal Opera House in February 1953, eight months before her death. Among her many memorials, the Kathleen Ferrier Cancer Research Fund was launched in May 1954. The Kathleen Ferrier Scholarship Fund, administered by the Royal Philharmonic Society, has since 1956 made annual awards to aspiring young professional singers."
model, err := LoadGPT2Model("./gpt2_124M.bin", "./gpt2_tokenizer.bin")
require.NoError(t, err)
output, err := model.Inference(randomText)
output, err := model.Inference(randomText, 1, 1)
require.NoError(t, err)
t.Log(output)
}
10 changes: 10 additions & 0 deletions tensor.go
@@ -90,6 +90,16 @@ type ParameterTensors struct {
LayerFinNormB tensor // (C) - Final layer normalization biases
}

func newParameterTensors(V, C, maxSeqLen, L int) ParameterTensors {
var tensor ParameterTensors
tensor.Init(V, C, maxSeqLen, L)
return tensor
}

func (tensor *ParameterTensors) Len() int {
return len(tensor.Memory)
}

// Init initialises the ParameterTensors with specific sizes for each tensor based on the model architecture.
func (tensor *ParameterTensors) Init(V, C, maxSeqLen, L int) {
tensor.Memory = make([]float32,
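Params.Len() is what replaces the deleted NumParameters field across gpt.go. A quick sketch of the new constructor pair, using GPT-2 small's published dimensions (V=50257, C=768, maxSeqLen=1024, L=12); assumes fmt is imported:

params := newParameterTensors(50257, 768, 1024, 12)
fmt.Printf("num_parameters: %d\n", params.Len()) // same count gpt.go now reports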