Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replaced base64 with base58 in seqhash #69

Merged
merged 4 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Updated seqhash2 to use base58 rather than base64 [#69](https://github.com/Koeng101/dnadesign/pull/69)
- Updated dual barcodes primer sets to be created without csv files [#67](https://github.com/Koeng101/dnadesign/pull/67)
- Added workers to bio as a way to process data [#62](https://github.com/Koeng101/dnadesign/pull/62)
- Improved megamash efficiency and added []Match JSON conversion [#61](https://github.com/Koeng101/dnadesign/pull/61)
Expand Down
67 changes: 67 additions & 0 deletions lib/seqhash/base58.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package seqhash

import (
"errors"
"math/big"
"strings"
)

const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"

// encodeToBase58 renders input as a Base58 string using alphabet.
//
// The bytes are read as one big-endian unsigned integer and written out in
// base len(alphabet), most significant digit first. Every leading zero byte of
// input is emitted as a leading '1' (the zero digit of alphabet), because those
// bytes would otherwise vanish in the integer conversion. Empty or nil input
// yields the empty string.
func encodeToBase58(input []byte) string {
	radix := big.NewInt(int64(len(alphabet)))
	value := new(big.Int).SetBytes(input)
	remainder := new(big.Int)

	// Peel off digits by repeated division; they arrive least significant
	// first, so they are written out back-to-front below.
	var digits []byte
	for value.Sign() > 0 {
		value.DivMod(value, radix, remainder)
		digits = append(digits, alphabet[remainder.Int64()])
	}

	// Count the run of zero bytes at the front of the input.
	zeros := 0
	for zeros < len(input) && input[zeros] == 0 {
		zeros++
	}

	var out strings.Builder
	out.Grow(zeros + len(digits))
	out.WriteString(strings.Repeat("1", zeros))
	for i := len(digits) - 1; i >= 0; i-- {
		out.WriteByte(digits[i])
	}
	return out.String()
}

// decodeFromBase58 converts a Base58 string back into the bytes it encodes.
//
// Each rune is looked up in alphabet and folded into a big integer, most
// significant digit first; a rune that is not in alphabet yields an error.
// Every leading '1' in input becomes one leading zero byte in the result. The
// empty string decodes to an empty, non-nil slice.
func decodeFromBase58(input string) ([]byte, error) {
	if input == "" {
		return []byte{}, nil
	}

	radix := big.NewInt(int64(len(alphabet)))
	value := new(big.Int)
	digit := new(big.Int)
	for _, r := range input {
		idx := strings.IndexRune(alphabet, r)
		if idx < 0 {
			return nil, errors.New("invalid character found")
		}
		digit.SetInt64(int64(idx))
		value.Mul(value, radix).Add(value, digit)
	}

	// Leading '1' characters stand for zero bytes that the integer
	// conversion cannot represent, so count them and pad the front.
	zeros := 0
	for zeros < len(input) && input[zeros] == '1' {
		zeros++
	}

	payload := value.Bytes()
	out := make([]byte, zeros+len(payload))
	copy(out[zeros:], payload)
	return out, nil
}
2 changes: 1 addition & 1 deletion lib/seqhash/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func Example_basic() {

sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded))
fmt.Println(sequenceSeqhash)
// Output: C_JJgg9ahMxAQzDm2XveE7WA==
// Output: C_5X6Hudy3K8ht7r4mvu9Gco
}

func ExampleRotateSequence() {
Expand Down
57 changes: 45 additions & 12 deletions lib/seqhash/seqhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,18 @@ much shorter. The intended use case are for handling sequences with LLM systems
since these systems' context window is a valuable resource, and smaller references
allow the system to be more focused. Version 2 seqhashes are approximately 3x
smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can
be also encoded with base64 to get a hash that can be used as a string across
be also encoded with base58 to get a hash that can be used as a string across
different systems. Here is a length comparison:

version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
version 2: C_JPQCj5PgjFwjy7jaoYmwqQ==
version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1508a615f46350
version 2: C_5X6Hudy3K8ht7r4mvu9Gco

The metadata is now encoded in a 1 byte flag, rather than in the 7 rune
metadata string used in version 1. Rather than use 256 bits for encoding
the hash, we use 120 bits. Since seqhashes are not meant for security, this
is good enough (50% collision with 1.3x10^18 hashes), while making them
conveniently only 16 bytes long. Additionally, encoded prefixes are added
to the front of the base64 encoded hash as a heuristic device for LLMs while
to the front of the base58 encoded hash as a heuristic device for LLMs while
processing batches of seqhashes.

In addition, seqhashes can now encode fragments. Fragments are double stranded
Expand All @@ -68,12 +68,17 @@ overhangs flanking both sides. These fragments can encode genetic parts - and
an important part of any vector containing these parts would be the part
seqhash, rather than the vector seqhash. This enhancement allows you to
identify genetic parts regardless of their context.

Base58 is used rather than base64 so that seqhashes can easily be added into
urls without a "/" in the identifier. Ironically, it also makes smaller hashes
than base64 due to base64 chunking 3 bytes at a time - at 16 bytes, 2 blank
bytes are added to make the seqhash divisible by 3. Base58 chunks differently,
and so doesn't encounter this problem.
*/
package seqhash

import (
"crypto/sha256"
"encoding/base64"
"errors"
"sort"
"strings"
Expand Down Expand Up @@ -190,7 +195,7 @@ var (
func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte {
var flag byte

// Encode the version (assuming version is in the range 0-15)
// Encode the version (assuming version is in the range 0-16)
flag |= (byte(version) << hash2versionShift)

// Encode the circularity
Expand Down Expand Up @@ -285,9 +290,9 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
flag := EncodeFlag(2, sequenceType, circular, doubleStranded)
result[0] = flag

// Compute BLAKE3, then copy those to the remaining 15 bytes
// Compute SHA-256, then copy its leading bytes into the remaining 15 bytes
newhash := sha256.Sum256([]byte(deterministicSequence))
copy(result[1:], newhash[:15])
copy(result[1:], newhash[:16])

return result, nil
}
Expand All @@ -305,7 +310,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
// enzymes.
//
// In order to make sure fwdOverhangLength and revOverhangLength fit in the
// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
// hash, the hash is truncated at 13 bytes rather than 16, and both int8 are
// inserted. So the bytes would be:
//
// flag + fwdOverhangLength + revOverhangLength + [13]byte(hash)
Expand Down Expand Up @@ -387,12 +392,40 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{
{FRAGMENT, true, true}: 'N',
}

// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
// EncodeHash2 encodes Hash2 as a base58 string. It also adds a single
// letter metadata tag that can be used as an easy heuristic for an LLM to
// identify misbehaving code.
func EncodeHash2(hash [16]byte, err error) (string, error) {
if err != nil {
return "", err
}
_, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0])
encoded := base64.StdEncoding.EncodeToString(hash[:])
encoded := encodeToBase58(hash[:])

return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, nil
}

// DecodeHash2 decodes a seqhash into a [16]byte, including the metadata tag.
func DecodeHash2(encodedString string) ([16]byte, error) {
// First, we need to decompose the string into the metadata and the
// seqhash.
parts := strings.SplitN(encodedString, "_", 2)
if len(parts) != 2 {
return [16]byte{}, errors.New("invalid encoded string format")
}

// Decode the Base58 encoded part
decodedBytes, err := decodeFromBase58(parts[1])
if err != nil {
return [16]byte{}, err
}

// Ensure decoded bytes fit into a [16]byte array
if len(decodedBytes) != 16 {
return [16]byte{}, errors.New("decoded hash does not match expected length")
}

return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
var hash [16]byte
copy(hash[:], decodedBytes)
return hash, nil
}
82 changes: 69 additions & 13 deletions lib/seqhash/seqhash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package seqhash

import (
"bytes"
"errors"
"fmt"
"os"
"testing"
Expand Down Expand Up @@ -34,37 +35,85 @@ func TestHash2(t *testing.T) {

// Test circular double stranded hashing
seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true))
if seqhash != "A_LGxts7bxq55Uiq+E94pcYg==" {
t.Errorf("Circular double stranded hashing failed. Expected A_LGxts7bxq55Uiq+E94pcYg==, got: " + seqhash)
if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" {
t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash)
}
// Test circular single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false))
if seqhash != "B_KB3s/EXx/C9wJvVE/gzw7Q==" {
t.Errorf("Circular single stranded hashing failed. Expected B_KB3s/EXx/C9wJvVE/gzw7Q==, got: " + seqhash)
if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" {
t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash)
}
// Test linear double stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true))
if seqhash != "C_JN15Uk5YpkXcKaJt0ozLRQ==" {
t.Errorf("Linear double stranded hashing failed. Expected C_JN15Uk5YpkXcKaJt0ozLRQ==, got: " + seqhash)
if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" {
t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash)
}
// Test linear single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false))
if seqhash != "D_IC0pLlPHC/zPQpSqU6hy0A==" {
t.Errorf("Linear single stranded hashing failed. Expected D_IC0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" {
t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash)
}

// Test RNA Seqhash
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false))
if seqhash != "H_IS0pLlPHC/zPQpSqU6hy0A==" {
t.Errorf("Linear single stranded hashing failed. Expected H_IS0pLlPHC/zPQpSqU6hy0A==, got: " + seqhash)
if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" {
t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash)
}
// Test Protein Seqhash
seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false))
if seqhash != "I_IiAwHj+EfYcQCf6Ty64wUg==" {
t.Errorf("Linear single stranded hashing failed. Expected I_IiAwHj+EfYcQCf6Ty64wUg==, got: " + seqhash)
if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" {
t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash)
}
}

// TestEncodeAndDecode checks that a Hash2 value survives an
// EncodeHash2/DecodeHash2 round trip, that EncodeHash2 propagates an incoming
// error, and that DecodeHash2 rejects malformed seqhash strings.
func TestEncodeAndDecode(t *testing.T) {
	// Each round-trip step depends on the previous one, so stop at the first
	// failure instead of reporting follow-on errors from invalid state.
	rawBytes, err := Hash2("ATGC", "DNA", false, true)
	if err != nil {
		t.Fatalf("Got bad hash: %s", err)
	}
	encoded, err := EncodeHash2(rawBytes, err)
	if err != nil {
		t.Fatalf("Failed to encode: %s", err)
	}
	decoded, err := DecodeHash2(encoded)
	if err != nil {
		t.Fatalf("Failed to decode: %s", err)
	}
	// Arrays are comparable, so a mismatch is reported once rather than once
	// per differing byte.
	if decoded != rawBytes {
		t.Errorf("Failed to decode properly. Expected %x, got %x", rawBytes, decoded)
	}

	// An error handed to EncodeHash2 must come back out.
	if _, err := EncodeHash2([16]byte{}, errors.New("test")); err == nil {
		t.Errorf("should fail on test error")
	}

	// Malformed inputs that DecodeHash2 must reject.
	badInputs := []struct {
		input  string
		reason string
	}{
		{"", "should fail on no metadata"},
		{"A_", "should fail on empty data"},
		{"A_/", "should fail on bad character"},
		{"A_11111", "should fail on 1s because length is wrong."},
	}
	for _, bad := range badInputs {
		if _, err := DecodeHash2(bad.input); err == nil {
			t.Errorf("DecodeHash2(%q): %s", bad.input, bad.reason)
		}
	}

	// Leading zero bytes are each encoded as a '1'.
	if got := encodeToBase58([]byte{0, 0, 0, 0}); got != "1111" {
		t.Errorf("Expected leading zero bytes to encode as 1111, got: %s", got)
	}
}

func TestLeastRotation(t *testing.T) {
file, _ := os.Open("../data/puc19.gbk")
defer file.Close()
Expand Down Expand Up @@ -110,7 +159,14 @@ func TestHash2Fragment(t *testing.T) {
}
// Test actual hash
sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4))
expectedHash := "K_IwQE3XlSTlimRdwpom3SjA=="
expectedHash := "K_5KnZQEnPRzJSYPkbPwLCJF"
if sqHash != expectedHash {
t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
}

// Test another hash
sqHash, _ = EncodeHash2(Hash2Fragment("TTAGCCCAT", 4, 4))
expectedHash = "K_5KnZQEnPRzJSYPkbPwLCJF"
if sqHash != expectedHash {
t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
}
Expand Down
Loading