From 8c8b33b1f5fdd500ac50f7ee505151b45d867d7e Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 26 Mar 2024 13:57:51 -0700 Subject: [PATCH 1/4] Replaced base64 with base58 in seqhash --- lib/seqhash/example_test.go | 2 +- lib/seqhash/seqhash.go | 57 +++++++++++++++++++------ lib/seqhash/seqhash_test.go | 83 +++++++++++++++++++++++++++++++------ 3 files changed, 116 insertions(+), 26 deletions(-) diff --git a/lib/seqhash/example_test.go b/lib/seqhash/example_test.go index b9595b4..08107f4 100644 --- a/lib/seqhash/example_test.go +++ b/lib/seqhash/example_test.go @@ -17,7 +17,7 @@ func Example_basic() { sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded)) fmt.Println(sequenceSeqhash) - // Output: C_JJgg9ahMxAQzDm2XveE7WA== + // Output: C_5X6Hudy3K8ht7r4mvu9Gco } func ExampleRotateSequence() { diff --git a/lib/seqhash/seqhash.go b/lib/seqhash/seqhash.go index fb3ccda..c42a065 100644 --- a/lib/seqhash/seqhash.go +++ b/lib/seqhash/seqhash.go @@ -48,18 +48,18 @@ much shorter. The intended use case is for handling sequences with LLM systems since these systems' context window is a valuable resource, and smaller references allow the system to be more focused. Seqhash version 2 hashes are approximately 3x smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can -be also encoded with base64 to get a hash that can be used as a string across +be also encoded with base58 to get a hash that can be used as a string across different systems. Here is a length comparison: - version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 - version 2: C_JPQCj5PgjFwjy7jaoYmwqQ== + version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + version 2: C_5X6Hudy3K8ht7r4mvu9Gco The metadata is now encoded in a 1 byte flag rather than a metadata string, instead of 7 runes like in version 1. Rather than use 256 bits for encoding the hash, we use 120 bits. 
Since seqhashes are not meant for security, this is good enough (50% collision with 1.3x10^18 hashes), while making them conveniently only 16 bytes long. Additionally, encoded prefixes are added -to the front of the base64 encoded hash as a heuristic device for LLMs while +to the front of the base58 encoded hash as a heuristic device for LLMs while processing batches of seqhashes. In addition, seqhashes can now encode fragments. Fragments are double stranded @@ -68,12 +68,17 @@ overhangs flanking both sides. These fragments can encode genetic parts - and an important part of any vector containing these parts would be the part seqhash, rather than the vector seqhash. This enhancement allows you to identify genetic parts regardless of their context. + +Base58 is used rather than base64 so that seqhashes can easily be added into +URLs without a "/" in the identifier. Ironically, it also makes smaller hashes +than base64 due to base64 chunking 3 bytes at a time - at 16 bytes, 2 blank +bytes are added to make the seqhash divisible by 3. Base58 does not chunk its +input, and so doesn't encounter this problem. 
*/ package seqhash import ( "crypto/sha256" - "encoding/base64" "errors" "sort" "strings" @@ -190,7 +195,7 @@ var ( func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte { var flag byte - // Encode the version (assuming version is in the range 0-15) + // Encode the version (assuming version is in the range 0-15) flag |= (byte(version) << hash2versionShift) // Encode the circularity @@ -285,9 +290,9 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra flag := EncodeFlag(2, sequenceType, circular, doubleStranded) result[0] = flag - // Compute BLAKE3, then copy those to the remaining 15 bytes + // Compute SHA-256, then copy those to the remaining 15 bytes newhash := sha256.Sum256([]byte(deterministicSequence)) - copy(result[1:], newhash[:15]) + copy(result[1:], newhash[:16]) return result, nil } @@ -305,7 +310,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra // enzymes. // // In order to make sure fwdOverhangLength and revOverhangLength fit in the -// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are +// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are // inserted. So the bytes would be: // // flag + fwdOverhangLength + revOverhangLength + [13]byte(hash) @@ -387,12 +392,40 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{ {FRAGMENT, true, true}: 'N', } -// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single +// EncodeHash2 encodes Hash2 as a base58 string. It also adds a single // letter metadata tag that can be used as an easy heuristic for an LLM to // identify misbehaving code. 
func EncodeHash2(hash [16]byte, err error) (string, error) { + if err != nil { + return "", err + } _, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0]) - encoded := base64.StdEncoding.EncodeToString(hash[:]) + encoded := encodeToBase58(hash[:]) + + return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, nil +} + +// DecodeHash2 decodes a seqhash into a [16]byte, including the metadata tag. +func DecodeHash2(encodedString string) ([16]byte, error) { + // First, we need to decompose the the string into the metadata and the + // seqhash. + parts := strings.SplitN(encodedString, "_", 2) + if len(parts) != 2 { + return [16]byte{}, errors.New("invalid encoded string format") + } + + // Decode the Base58 encoded part + decodedBytes, err := decodeFromBase58(parts[1]) + if err != nil { + return [16]byte{}, err + } + + // Ensure decoded bytes fit into a [16]byte array + if len(decodedBytes) != 16 { + return [16]byte{}, errors.New("decoded hash does not match expected length") + } - return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err + var hash [16]byte + copy(hash[:], decodedBytes) + return hash, nil } diff --git a/lib/seqhash/seqhash_test.go b/lib/seqhash/seqhash_test.go index 82e3622..b1e99c0 100644 --- a/lib/seqhash/seqhash_test.go +++ b/lib/seqhash/seqhash_test.go @@ -2,6 +2,7 @@ package seqhash import ( "bytes" + "errors" "fmt" "os" "testing" @@ -34,37 +35,86 @@ func TestHash2(t *testing.T) { // Test circular double stranded hashing seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true)) - if seqhash != "A_LGxts7bxq55Uiq+E94pcYg==" { - t.Errorf("Circular double stranded hashing failed. Expected A_LGxts7bxq55Uiq+E94pcYg==, got: " + seqhash) + if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" { + t.Errorf("Circular double stranded hashing failed. 
Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash) } // Test circular single stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false)) - if seqhash != "B_KB3s/EXx/C9wJvVE/gzw7Q==" { - t.Errorf("Circular single stranded hashing failed. Expected B_KB3s/EXx/C9wJvVE/gzw7Q==, got: " + seqhash) + if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" { + t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash) } // Test linear double stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true)) - if seqhash != "C_JN15Uk5YpkXcKaJt0ozLRQ==" { - t.Errorf("Linear double stranded hashing failed. Expected C_JN15Uk5YpkXcKaJt0ozLRQ==, got: " + seqhash) + if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" { + t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash) } // Test linear single stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false)) - if seqhash != "D_IC0pLlPHC/zPQpSqU6hy0A==" { - t.Errorf("Linear single stranded hashing failed. Expected D_IC0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash) + if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" { + t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash) } // Test RNA Seqhash seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false)) - if seqhash != "H_IS0pLlPHC/zPQpSqU6hy0A==" { - t.Errorf("Linear single stranded hashing failed. Expected H_IS0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash) + if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" { + t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash) } // Test Protein Seqhash seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false)) - if seqhash != "I_IiAwHj+EfYcQCf6Ty64wUg==" { - t.Errorf("Linear single stranded hashing failed. 
Expected I_IiAwHj+EfYcQCf6Ty64wUg==, got: " + seqhash) + if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" { + t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash) } } +func TestEncodeAndDecode(t *testing.T) { + rawBytes, err := Hash2("ATGC", "DNA", false, true) + if err != nil { + t.Errorf("Got bad hash: %s", err) + } + encoded, err := EncodeHash2(rawBytes, err) + if err != nil { + t.Errorf("Failed to encode: %s", err) + } + decoded, err := DecodeHash2(encoded) + if err != nil { + t.Errorf("Failed to decode: %s", err) + } + for i := range rawBytes { + if rawBytes[i] != decoded[i] { + t.Errorf("Failed to decode properly.") + } + } + _, err = EncodeHash2([16]byte{}, errors.New("test")) + if err == nil { + t.Errorf("should fail on test error") + } + + // Test no metadata + _, err = DecodeHash2("") + if err == nil { + t.Errorf("should fail on no metadata") + } + // Test empty decode + _, err = DecodeHash2("A_") + if err == nil { + t.Errorf("should fail on empty data") + } + // Test bad char + _, err = DecodeHash2("A_/") + if err == nil { + t.Errorf("should fail on bad character") + } + // Test 1s + _, err = DecodeHash2("A_11111") + if err == nil { + t.Errorf("should fail on 1s because length is wrong.") + } + + // just to make sure gocov goes through + _ = encodeToBase58([]byte{0, 0, 0, 0}) + +} + func TestLeastRotation(t *testing.T) { file, _ := os.Open("../data/puc19.gbk") defer file.Close() @@ -110,7 +160,14 @@ func TestHash2Fragment(t *testing.T) { } // Test actual hash sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4)) - expectedHash := "K_IwQE3XlSTlimRdwpom3SjA==" + expectedHash := "K_5KnZQEnPRzJSYPkbPwLCJF" + if sqHash != expectedHash { + t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) + } + + // Test another hash + sqHash, _ = EncodeHash2(Hash2Fragment("TTAGCCCAT", 4, 4)) + expectedHash = "K_5KnZQEnPRzJSYPkbPwLCJF" if sqHash != expectedHash { t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) } From 
db6f42eba3afe5ae8c4ba3d7d89dc765d047d536 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 26 Mar 2024 14:00:40 -0700 Subject: [PATCH 2/4] add to changelog --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f2feab3..4044859 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Updated seqhash2 to use base58 rather than base64 [#69](https://github.com/Koeng101/dnadesign/pull/69) - Updated dual barcodes primer sets to be created without csv files [#67](https://github.com/Koeng101/dnadesign/pull/67) - Added workers to bio as a way to process data [#62](https://github.com/Koeng101/dnadesign/pull/62) - Improved megamash efficiency and added []Match JSON conversion [#61](https://github.com/Koeng101/dnadesign/pull/61) From 2bc0e95a8f6fd56251916df79d2e3bbcba6e3db0 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 26 Mar 2024 14:01:45 -0700 Subject: [PATCH 3/4] make linter happy --- lib/seqhash/seqhash_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/seqhash/seqhash_test.go b/lib/seqhash/seqhash_test.go index b1e99c0..915df7a 100644 --- a/lib/seqhash/seqhash_test.go +++ b/lib/seqhash/seqhash_test.go @@ -112,7 +112,6 @@ func TestEncodeAndDecode(t *testing.T) { // just to make sure gocov goes through _ = encodeToBase58([]byte{0, 0, 0, 0}) - } func TestLeastRotation(t *testing.T) { From d92c9bbe65907e01297af207266dc9e998ee0b4d Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 26 Mar 2024 14:10:27 -0700 Subject: [PATCH 4/4] add base58 --- lib/seqhash/base58.go | 67 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 lib/seqhash/base58.go diff --git a/lib/seqhash/base58.go b/lib/seqhash/base58.go new file mode 100644 index 0000000..31dd0dc --- /dev/null +++ 
b/lib/seqhash/base58.go @@ -0,0 +1,67 @@ +package seqhash + +import ( + "errors" + "math/big" + "strings" +) + +const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz" + +// encodeToBase58 encodes a byte slice to a Base58 string +func encodeToBase58(input []byte) string { + // Convert byte slice to a big.Int + num := big.NewInt(0).SetBytes(input) + base := big.NewInt(int64(len(alphabet))) + mod := &big.Int{} + var encoded strings.Builder + + // Convert to base58 + for num.Sign() > 0 { + num.DivMod(num, base, mod) + encoded.WriteByte(alphabet[mod.Int64()]) + } + + // Add '1' for each leading 0 byte + for _, b := range input { + if b != 0 { + break + } + encoded.WriteByte('1') + } + + // Reverse the encoded string + result := []byte(encoded.String()) + for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 { + result[i], result[j] = result[j], result[i] + } + + return string(result) +} + +// decodeFromBase58 decodes a Base58 string to a byte slice +func decodeFromBase58(input string) ([]byte, error) { + if len(input) == 0 { + return []byte{}, nil + } + + num := big.NewInt(0) + base := big.NewInt(int64(len(alphabet))) + for _, c := range input { + charIndex := strings.IndexRune(alphabet, c) + if charIndex == -1 { + return nil, errors.New("invalid character found") + } + num.Mul(num, base) + num.Add(num, big.NewInt(int64(charIndex))) + } + + decoded := num.Bytes() + // Add leading zeros + if input[0] == '1' { + leadingZeros := len(input) - len(strings.TrimLeft(input, "1")) + decoded = append(make([]byte, leadingZeros), decoded...) + } + + return decoded, nil +}