From 7d2ea0d2d60c4bf0ee398d7e81a654e12fe5fe16 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Thu, 27 Jun 2024 18:07:09 -0700 Subject: [PATCH] pfam proper count --- lib/tokenizer/cli/main.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/tokenizer/cli/main.go b/lib/tokenizer/cli/main.go index 958a4ac..7856db7 100644 --- a/lib/tokenizer/cli/main.go +++ b/lib/tokenizer/cli/main.go @@ -18,7 +18,7 @@ import ( func main() { // Define flags - shardSize := flag.Int("shardSize", int(math.Pow(10, 8)), "Size of each shard") // uniprot sprot splits into 40 files, so 2.5% is retained for validation + shardSize := flag.Int("shardSize", int(math.Pow(10, 7)), "Size of each shard") // uniprot sprot splits into 40 files, so 2.5% is retained for validation outputDir := flag.String("outputDir", "", "Output directory path") tremblInput := flag.String("tremblInput", "", "Trembl input directory") unirefInput := flag.String("unirefInput", "", "Uniref input directory") @@ -122,10 +122,14 @@ func main() { pfamCount++ return true }) + pfamCount := make(map[string]bool) for _, values := range pfamMap { for _, pfam := range values { - pfamCount++ - tokenizer.TokenMap.Store(pfam, pfamCount) + _, ok := pfamCount[pfam] + if !ok { + pfamCount++ + tokenizer.TokenMap.Store(pfam, pfamCount) + } } } tokenizerJSON, err := tokenizer.ToJSON()