diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d4bcb86..b29dc2a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,6 +93,18 @@ In order to simplify the development experience, and environment setup, the poly Whether you're a beginner with Go or you're an experienced developer, You should see the suggestions popup automatically when you goto the *Plugins* tab in VSCode. Using these plugins can help accelerate the development experience and also allow you to work more collaboratively with other poly developers. +## Local Checks + +Poly runs numerous CI/CD checks via Github Actions before a PR can be merged. In order to make your PR mergeable, your PR must pass all of these checks. + +A quick way to check your PR will pass is to run: + +```sh +gofmt -s -w . && go test ./... +``` + +Additionally, you may want to [install](https://golangci-lint.run/usage/install/#local-installation) and run the linter. + # How to report a bug ### Security disclosures diff --git a/data/NC_001141.2_redux.gb b/data/NC_001141.2_redux.gb new file mode 100644 index 00000000..6a45765c --- /dev/null +++ b/data/NC_001141.2_redux.gb @@ -0,0 +1,147 @@ +LOCUS NC_001141 439888 bp DNA linear CON 15-SEP-2023 +DEFINITION Saccharomyces cerevisiae S288C chromosome IX, complete sequence. +ACCESSION NC_001141 +VERSION NC_001141.2 +DBLINK BioProject: PRJNA128 + Assembly: GCF_000146045.2 +KEYWORDS RefSeq. +SOURCE Saccharomyces cerevisiae S288C + ORGANISM Saccharomyces cerevisiae S288C + Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina; + Saccharomycetes; Saccharomycetales; Saccharomycetaceae; + Saccharomyces. +REFERENCE 1 (bases 1 to 439888) + AUTHORS Engel,S.R., Wong,E.D., Nash,R.S., Aleksander,S., Alexander,M., + Douglass,E., Karra,K., Miyasato,S.R., Simison,M., Skrzypek,M.S., + Weng,S. and Cherry,J.M. 
+ TITLE New data and collaborations at the Saccharomyces Genome Database: + updated reference genome, alleles, and the Alliance of Genome + Resources + JOURNAL Genetics 220 (4) (2022) + PUBMED 34897464 +REFERENCE 2 (bases 1 to 439888) + AUTHORS Churcher,C., Bowman,S., Badcock,K., Bankier,A., Brown,D., + Chillingworth,T., Connor,R., Devlin,K., Gentles,S., Hamlin,N., + Harris,D., Horsnell,T., Hunt,S., Jagels,K., Jones,M., Lye,G., + Moule,S., Odell,C., Pearson,D., Rajandream,M., Rice,P., Rowley,N., + Skelton,J., Smith,V., Barrell,B. et al. + TITLE The nucleotide sequence of Saccharomyces cerevisiae chromosome IX + JOURNAL Nature 387 (6632 SUPPL), 84-87 (1997) + PUBMED 9169870 +REFERENCE 3 (bases 1 to 439888) + AUTHORS Goffeau,A., Barrell,B.G., Bussey,H., Davis,R.W., Dujon,B., + Feldmann,H., Galibert,F., Hoheisel,J.D., Jacq,C., Johnston,M., + Louis,E.J., Mewes,H.W., Murakami,Y., Philippsen,P., Tettelin,H. and + Oliver,S.G. + TITLE Life with 6000 genes + JOURNAL Science 274 (5287), 546 (1996) + PUBMED 8849441 +REFERENCE 4 (bases 1 to 439888) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (14-SEP-2023) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 5 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (04-MAY-2012) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA + REMARK Protein update by submitter +REFERENCE 6 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (31-MAR-2011) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA + REMARK Sequence update by submitter +REFERENCE 7 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (14-DEC-2009) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA +COMMENT REVIEWED REFSEQ: This record has been curated by SGD. 
The reference + sequence is identical to BK006942. + + On Apr 26, 2011 this sequence version replaced NC_001141.1. + + ##Genome-Annotation-Data-START## + Annotation Provider :: SGD + Annotation Status :: Full Annotation + Annotation Version :: R64-4-1 + URL :: http://www.yeastgenome.org/ + ##Genome-Annotation-Data-END## + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..439888 + /organism="Saccharomyces cerevisiae S288C" + /mol_type="genomic DNA" + /strain="S288C" + /db_xref="taxon:559292" + /chromosome="IX" + telomere complement(1..7784) + /note="TEL09L; Telomeric region on the left arm of + Chromosome IX; composed of an X element core sequence, X + element combinatorial repeats, a long Y' element, and a + short terminal stretch of telomeric repeats" + /db_xref="SGD:S000028896" + gene complement(<483..>6147) + /locus_tag="YIL177C" + /db_xref="GeneID:854630" + mRNA complement(join(<483..4598,4987..>6147)) + /locus_tag="YIL177C" + /product="Y' element ATP-dependent helicase" + /transcript_id="NM_001179522.1" + /db_xref="GeneID:854630" + CDS complement(join(483..4598,4987..6147)) + /locus_tag="YIL177C" + /EC_number="3.6.4.12" + /note="Putative Y' element ATP-dependent helicase" + /codon_start=1 + /product="Y' element ATP-dependent helicase" + /protein_id="NP_012092.1" + /db_xref="GeneID:854630" + /db_xref="SGD:S000001439" + /translation="MKVSDRRKFEKANFDEFESALNNKNDLVHCPSITLFESIPTEVR + SFYEDEKSGLIKVVKFRTGAMDRKRSFEKVVISVMVGKNVKKFLTFVEDEPDFQGGPI + PSKYLIPKKINLMVYTLFQVHTLKFNRKDYDTLSLFYLNRGYYNELSFRVLERCHEIA + SARPNDSSTMRTFTDFVSGAPIVRSLQKSTIRKYGYNLAPYMFLLLHVDELSIFSAYQ + ASLPGEKKVDTERLKRDLCPRKPIEIKYFSQICNDMMNKKDRLGDILHIILRACALNF + GAGPRGGAGDEEDRSITNEEPIIPSVDEHGLKVCKLRSPNTPRRLRKTLDAVKALLVS + SCACTARDLDIFDDNNGVAMWKWIKILYHEVAQETTLKDSYRITLVPSSDGISLLAFA + GPQRNVYVDDTTRRIQLYTDYNKNGSSEPRLKTLDGLTSDYVFYFVTVLRQMQICALG + NSYDAFNHDPWMDVVGFEDPNQVTNRDISRIVLYSYMFLNTAKGCLVEYATFRQYMRE + LPKNAPQKLNFREMRQGLIALGRHCVGSRFETDLYESATSELMANHSVQTGRNIYGVD + 
SFSLTSVSGTTATLLQERASERWIQWLGLESDYHCSFSSTRNAEDVVAGEAASSNHHQ + KISRVTRKRPREPKSTNDILVAGQKLFGSSFEFRDLHQLRLCYEIYMADTPSVAVQAP + PGYGKTELFHLPLIALASKGDVEYVSFLFVPYTVLLANCMIRLGRCGCLNVAPVRNFI + EEGYDGVTDLYVGIYDDLASTNFTDRIAAWENIVECTFRTNNVKLGYLIVDEFHNFET + EVYRQSQFGGITNLDFDAFEKAIFLSGTAPEAVADAALQRIGLTGLAKKSMDINELKR + SEDLSRGLSSYPTRMFNLIKEKSEVPLGHVHKIRKKVESQPEEALKLLLALFESEPES + KAIVVASTTNEVEELACSWRKYFRVVWIHGKLGAAEKVSRTKEFVTDGSMQVLIGTKL + VTEGIDIKQLMMVIMLDNRLNIIELIQGVGRLRDGGLCYLLSRKNSWAARNRKGELPP + IKEGCITEQVREFYGLESKKGKKGQHVGCCGSRTDLSADTVELIERMDRLAEKQATAS + MSIVALPSSFQESNSSDRYRKYCSSDEDSNTCIHGSANASTNASTNAITTASTNVRTN + ATTNASTNATTNASTNASTNATTNASTNATTNSSTNATTTASTNVRTSATTTASINVR + TSATTTESTNSSTNATTTESTNSSTNATTTESTNSNTSATTTASINVRTSATTTESTN + SSTSATTTASINVRTSATTTKSINSSTNATTTESTNSNTNATTTESTNSSTNATTTES + TNSSTNATTTESTNSNTSAATTESTNSNTSATTTESTNASAKEDANKDGNAEDNRFHP + VTDINKESYKRKGSQMVLLERKKLKAQFPNTSENMNVLQFLGFRSDEIKHLFLYGIDI + YFCPEGVFTQYGLCKGCQKMFELCVCWAGQKVSYRRIAWEALAVERMLRNDEEYKEYL + EDIEPYHGDPVGYLKYFSVKRREIYSQIQRNYAWYLAITRRRETISVLDSTRGKQGSQ + VFRMSGRQIKELYFKVWSNLRESKTEVLQYFLNWDEKKCQEEWEAKDDTVVVEALEKG + GVFQRLRSMTSAGLQGPQYVKLQFSRHHRQLRSRYELSLGMHLRDQIALGVTPSKVPH + WTAFLSMLIGLFYNKTFRQKLEYLLEQISEVWLLPHWLDLANVEVLAADDTRVPLYML + MVAVHKELDSDDVPDGRFDILLCRDSSREVGE" + rep_origin 7470..8793 + /note="ARS902; Putative replication origin; identified in + multiple array studies, not yet confirmed by plasmid-based + assay" + /db_xref="SGD:S000130156" + mRNA join(<155222,155311..>155765) + /gene="COX5B" + /locus_tag="YIL111W" + /product="cytochrome c oxidase subunit Vb" + /transcript_id="NM_001179459.1" + /db_xref="GeneID:854695" +CONTIG join(BK006942.2:1..439888) +// + diff --git a/go.mod b/go.mod index 41892465..60fc07d4 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/mroth/weightedrand v0.4.1 github.com/pmezard/go-difflib v1.0.0 github.com/sergi/go-diff v1.2.0 + github.com/spaolacci/murmur3 v1.1.0 golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 lukechampine.com/blake3 v1.1.5 
) @@ -16,7 +17,6 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/mattn/go-sqlite3 v1.14.13 // indirect - github.com/spaolacci/murmur3 v1.1.0 // indirect ) require ( diff --git a/io/genbank/example_test.go b/io/genbank/example_test.go index 0bc8433e..aad96753 100644 --- a/io/genbank/example_test.go +++ b/io/genbank/example_test.go @@ -14,8 +14,8 @@ import ( func Example_basic() { sequences, _ := genbank.Read("../../data/puc19.gbk") for _, feature := range sequences.Features { - if feature.Attributes["gene"] == "bla" { - fmt.Println(feature.Attributes["note"]) + if gene, ok := feature.Attributes["gene"]; feature.Type == "CDS" && ok && gene[0] == "bla" { + fmt.Println(feature.Attributes["note"][0]) } } // Output: confers resistance to ampicillin, carbenicillin, andrelated antibiotics diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go index 468e17cf..dec78d59 100644 --- a/io/genbank/genbank.go +++ b/io/genbank/genbank.go @@ -66,14 +66,14 @@ type Meta struct { // Feature holds the information for a feature in a Genbank file and other annotated sequence files. type Feature struct { - Type string `json:"type"` - Description string `json:"description"` - Attributes map[string]string `json:"attributes"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - Sequence string `json:"sequence"` - Location Location `json:"location"` - ParentSequence *Genbank `json:"-"` + Type string `json:"type"` + Description string `json:"description"` + Attributes map[string][]string `json:"attributes"` + SequenceHash string `json:"sequence_hash"` + SequenceHashFunction string `json:"hash_function"` + Sequence string `json:"sequence"` + Location Location `json:"location"` + ParentSequence *Genbank `json:"-"` } // Reference holds information for one reference in a Meta struct. 
@@ -125,7 +125,25 @@ var ( sequenceRegex = regexp.MustCompile("[^a-zA-Z]+") ) +// StoreFeatureSequences calls StoreSequence on all features. +// The resulting JSON is guaranteed to have useful Feature.Sequence values. +// Useful when exporting for downstream analysis, such as with json.Marshal. +func (sequence *Genbank) StoreFeatureSequences() error { + for i := range sequence.Features { + _, err := sequence.Features[i].StoreSequence() + if err != nil { + return err + } + } + return nil +} + // AddFeature adds a feature to a Genbank struct. +// NOTE: This method assumes feature is not referenced in another location +// as this only creates a shallow copy. +// If you intend to duplicate a feature from another Genbank and plan +// to modify in either location, it is recommended you first call feature.Copy() +// before passing as input to save yourself trouble. func (sequence *Genbank) AddFeature(feature *Feature) error { feature.ParentSequence = sequence sequence.Features = append(sequence.Features, *feature) @@ -137,27 +155,58 @@ func (feature Feature) GetSequence() (string, error) { return getFeatureSequence(feature, feature.Location) } +// StoreSequence infers and assigns the value of feature.Sequence +// if currently an empty string. +func (feature *Feature) StoreSequence() (string, error) { + if feature.Sequence != "" { + return feature.Sequence, nil + } + seq, err := getFeatureSequence(*feature, feature.Location) + if err == nil { + feature.Sequence = seq + } + return seq, err +} + +// Copy creates deep copy of Feature, which supports safe duplication. 
+func (feature *Feature) Copy() Feature { + copy := *feature + copy.Location = CopyLocation(feature.Location) + copy.Attributes = NewMultiMap[string, string]() + ForEachKey(feature.Attributes, func(k string, v []string) { + copy.Attributes[k] = MapSlice(v, identity[string]) + }) + return copy +} + +// CopyLocation creates deep copy of Location, which supports safe duplication +func CopyLocation(location Location) Location { + location.SubLocations = MapSlice(location.SubLocations, CopyLocation) + return location +} + // getFeatureSequence takes a feature and location object and returns a sequence string. func getFeatureSequence(feature Feature, location Location) (string, error) { var sequenceBuffer bytes.Buffer - var sequenceString string parentSequence := feature.ParentSequence.Sequence if len(location.SubLocations) == 0 { sequenceBuffer.WriteString(parentSequence[location.Start:location.End]) } else { for _, subLocation := range location.SubLocations { - sequence, _ := getFeatureSequence(feature, subLocation) + sequence, err := getFeatureSequence(feature, subLocation) + if err != nil { + return "", err + } sequenceBuffer.WriteString(sequence) } } // reverse complements resulting string if needed. 
+ sequenceString := sequenceBuffer.String() if location.Complement { - sequenceString = transform.ReverseComplement(sequenceBuffer.String()) - } else { - sequenceString = sequenceBuffer.String() + sequenceString = transform.ReverseComplement(sequenceString) } return sequenceString, nil @@ -185,9 +234,10 @@ func ReadMultiNth(path string, count int) ([]Genbank, error) { return []Genbank{}, err } - sequence, err := parseMultiNthFn(file, count) - if err != nil { - return []Genbank{}, err + sequence, perr := parseMultiNthFn(file, count) + if perr != nil { + perr.file = path + return []Genbank{}, perr } return sequence, nil @@ -384,6 +434,36 @@ func ParseMulti(r io.Reader) ([]Genbank, error) { return genbankSlice, err } +// ParseError represents failures encountered while parsing, +// and pointers to it are fully compatible with the error interface. +type ParseError struct { + file string // the file origin + line string // the offending line + before bool // whether the error occurred before or on this line + lineNo int // the line number, 0 indexed + info string `default:"syntax error"` // description of the error type + wraps error // stores the error that led to this, if any +} + +func (e *ParseError) Error() string { + var out, loc string + if e.file == "" { + loc = fmt.Sprintf("line %d", e.lineNo) + } else { + loc = fmt.Sprintf("%s:%d", e.file, e.lineNo) + } + if e.before { + out = fmt.Sprintf("%s before %s", e.info, loc) + } else { + out = fmt.Sprintf("%s on %s: %s", e.info, loc, e.line) + } + if e.wraps != nil { + out = fmt.Sprintf("%s\nfrom %v", out, e.wraps) + } + return out +} + +// defines state for the parser, and utility methods to modify type parseLoopParameters struct { newLocation bool quoteActive bool @@ -406,14 +486,33 @@ type parseLoopParameters struct { // method to init loop parameters func (params *parseLoopParameters) init() { params.newLocation = true - params.feature.Attributes = make(map[string]string) + params.feature.Attributes = 
NewMultiMap[string, string]() params.parseStep = "metadata" params.genbankStarted = false params.genbank.Meta.Other = make(map[string]string) } +// save our completed attribute / qualifier string to the current feature +// useful as a wrap-up step from multiple states +func (params *parseLoopParameters) saveLastAttribute() { + newValue := params.attributeValue != "" + emptyType := params.feature.Type != "" + if newValue || emptyType { + if newValue { + Put(params.feature.Attributes, params.attribute, params.attributeValue) + } + params.features = append(params.features, params.feature) + + // reset attribute state + params.attributeValue = "" + params.attribute = "" + params.feature = Feature{} + params.feature.Attributes = NewMultiMap[string, string]() + } +} + // ParseMultiNth takes in a reader representing a multi gbk/gb/genbank file and parses the first n records into a slice of Genbank structs. -func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { +func ParseMultiNth(r io.Reader, count int) ([]Genbank, *ParseError) { scanner := bufio.NewScanner(r) var genbanks []Genbank @@ -446,11 +545,12 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { continue } + // define parser state machine switch parameters.parseStep { case "metadata": // Handle empty lines if len(line) == 0 { - return genbanks, fmt.Errorf("Empty metadata line on line %d", lineNum) + return genbanks, &ParseError{line: line, lineNo: lineNum, info: "unexpected empty metadata"} } // If we are currently reading a line, we need to figure out if it is a new meta line. @@ -471,7 +571,7 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { case "REFERENCE": reference, err := parseReferencesFn(parameters.metadataData) if err != nil { - return []Genbank{}, fmt.Errorf("Failed in parsing reference above line %d. 
Got error: %s", lineNum, err) + return []Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid reference"} } parameters.genbank.Meta.References = append(parameters.genbank.Meta.References, reference) @@ -504,7 +604,7 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields count, err := strconv.Atoi(fields[countIndex]) if err != nil { - return []Genbank{}, err + return []Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid base count"} } baseCount := BaseCount{ @@ -517,66 +617,51 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { } // Switch to sequence parsing originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN" - if originFlag { + contigFlag := strings.Contains(line, "CONTIG") + if originFlag || contigFlag { parameters.parseStep = "sequence" - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } else { - parameters.features = append(parameters.features, parameters.feature) - } + parameters.saveLastAttribute() // add our features to the genbank for _, feature := range parameters.features { + // TODO: parse location when line is read, or track line number so error is localized location, err := parseLocation(feature.Location.GbkLocationString) if err != nil { - return []Genbank{}, err + return []Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "invalid feature location"} } feature.Location = location 
err = parameters.genbank.AddFeature(&feature) if err != nil { - return []Genbank{}, err + return []Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "problem adding feature"} } } + + if contigFlag { + parameters.genbank.Meta.Other["CONTIG"] = parseMetadata(splitLine[1:]) + } continue - } // end sequence parsing flag logic + } // check if current line contains anything but whitespace trimmedLine := strings.TrimSpace(line) - if len(trimmedLine) < 1 { + if len(trimmedLine) == 0 { continue } + indent := countLeadingSpaces(parameters.currentLine) // determine if current line is a new top level feature - if countLeadingSpaces(parameters.currentLine) < countLeadingSpaces(parameters.prevline) || parameters.prevline == "FEATURES" { - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } - - // } - // checks for empty types - if parameters.feature.Type != "" { - parameters.features = append(parameters.features, parameters.feature) - } + if indent == 0 { + return genbanks, &ParseError{line: line, lineNo: lineNum, info: "unexpected metadata when parsing feature"} + } else if indent < countLeadingSpaces(parameters.prevline) || parameters.prevline == "FEATURES" { + parameters.saveLastAttribute() parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) + parameters.feature.Attributes = NewMultiMap[string, string]() // An initial feature line looks like this: `source 1..2686` with a type separated by its location if len(splitLine) < 2 { - return genbanks, fmt.Errorf("Feature line malformed on line %d. 
Got line: %s", lineNum, line) + return genbanks, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} } parameters.feature.Type = strings.TrimSpace(splitLine[0]) parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) @@ -599,7 +684,7 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { } // save our completed attribute / qualifier string to the current feature if parameters.attributeValue != "" || parameters.emptyAttribute { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue + Put(parameters.feature.Attributes, parameters.attribute, parameters.attributeValue) parameters.emptyAttribute = false } parameters.attributeValue = "" @@ -618,11 +703,13 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { } parameters.attributeValue = removeAttributeValueQuotes parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier + } else { + return genbanks, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} } case "sequence": if len(line) < 2 { // throw error if line is malformed - return genbanks, fmt.Errorf("Too short line found while parsing genbank sequence on line %d. 
Got line: %s", lineNum, line) + return genbanks, &ParseError{line: line, lineNo: lineNum, info: "too short line found while parsing genbank sequence"} } else if line[0:2] == "//" { // end of sequence parameters.genbank.Sequence = parameters.sequenceBuilder.String() @@ -844,7 +931,7 @@ func parseLocation(locationString string) (Location, error) { location.GbkLocationString = locationString if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x - position, err := strconv.Atoi(locationString) + position, err := strconv.Atoi(partialRegex.ReplaceAllString(locationString, "")) if err != nil { return Location{}, err } @@ -1000,13 +1087,10 @@ func BuildFeatureString(feature Feature) string { featureHeader := generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n" returnString := featureHeader - qualifierKeys := make([]string, 0, len(feature.Attributes)) - for key := range feature.Attributes { - qualifierKeys = append(qualifierKeys, key) - } - - for _, qualifier := range qualifierKeys { - returnString += generateWhiteSpace(qualifierIndex) + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" + if feature.Attributes != nil { + ForEachValue(feature.Attributes, func(key string, value string) { + returnString += generateWhiteSpace(qualifierIndex) + "/" + key + "=\"" + value + "\"\n" + }) } return returnString } diff --git a/io/genbank/genbank_test.go b/io/genbank/genbank_test.go index 9466c6b1..3bc472a2 100644 --- a/io/genbank/genbank_test.go +++ b/io/genbank/genbank_test.go @@ -34,6 +34,13 @@ var singleGbkPaths = []string{ // "../../data/pichia_chr1_head.gb", } +// Ignore ParentSequence as that's a pointer which can't be serialized. 
+func CmpOptions() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(Feature{}, "ParentSequence"), + } +} + func TestGbkIO(t *testing.T) { tmpDataDir, err := os.MkdirTemp("", "data-*") if err != nil { @@ -49,7 +56,7 @@ func TestGbkIO(t *testing.T) { _ = Write(gbk, tmpGbkFilePath) writeTestGbk, _ := Read(tmpGbkFilePath) - if diff := cmp.Diff(gbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(gbk, writeTestGbk, CmpOptions()...); diff != "" { t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) } } // end test single gbk read, write, build, parse @@ -82,7 +89,7 @@ func TestMultiGenbankIO(t *testing.T) { writeTestGbk, _ := ReadMulti(tmpGbkFilePath) - if diff := cmp.Diff(multiGbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(multiGbk, writeTestGbk, CmpOptions()...); diff != "" { t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) } } @@ -110,7 +117,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { testInputGbk, _ := Read("../../data/sample.gbk") testOutputGbk, _ := Read(tmpGbkFilePath) - if diff := cmp.Diff(testInputGbk, testOutputGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(testInputGbk, testOutputGbk, CmpOptions()...); diff != "" { t.Errorf("Issue with partial location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). 
Got this diff:\n%s", diff) } } @@ -135,7 +142,7 @@ func TestGbLocationStringBuilder(t *testing.T) { testInputGb, _ := Read("../../data/t4_intron.gb") testOutputGb, _ := Read(tmpGbFilePath) - if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(testInputGb, testOutputGb, CmpOptions()...); diff != "" { t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) } } @@ -207,24 +214,28 @@ func TestGetSequenceMethod(t *testing.T) { } func TestLocationParser(t *testing.T) { - gbk, _ := Read("../../data/t4_intron.gb") + gbk, err := Read("../../data/t4_intron.gb") + assert.NoError(t, err) + + err = gbk.StoreFeatureSequences() + assert.NoError(t, err) // Read 1..243 - feature, _ := gbk.Features[1].GetSequence() + feature := gbk.Features[1].Sequence seq := "atgagattacaacgccagagcatcaaagattcagaagttagaggtaaatggtattttaatatcatcggtaaagattctgaacttgttgaaaaagctgaacatcttttacgtgatatgggatgggaagatgaatgcgatggatgtcctctttatgaagacggagaaagcgcaggattttggatttaccattctgacgtcgagcagtttaaagctgattggaaaattgtgaaaaagtctgtttga" if feature != seq { t.Errorf("Feature sequence parser has changed on test '1..243'. 
Got this:\n%s instead of \n%s", feature, seq) } // Read join(893..1441,2459..2770) - featureJoin, _ := gbk.Features[6].GetSequence() + featureJoin := gbk.Features[6].Sequence seqJoin := "atgaaacaataccaagatttaattaaagacatttttgaaaatggttatgaaaccgatgatcgtacaggcacaggaacaattgctctgttcggatctaaattacgctgggatttaactaaaggttttcctgcggtaacaactaagaagctcgcctggaaagcttgcattgctgagctaatatggtttttatcaggaagcacaaatgtcaatgatttacgattaattcaacacgattcgttaatccaaggcaaaacagtctgggatgaaaattacgaaaatcaagcaaaagatttaggataccatagcggtgaacttggtccaatttatggaaaacagtggcgtgattttggtggtgtagaccaaattatagaagttattgatcgtattaaaaaactgccaaatgataggcgtcaaattgtttctgcatggaatccagctgaacttaaatatatggcattaccgccttgtcatatgttctatcagtttaatgtgcgtaatggctatttggatttgcagtggtatcaacgctcagtagatgttttcttgggtctaccgtttaatattgcgtcatatgctacgttagttcatattgtagctaagatgtgtaatcttattccaggggatttgatattttctggtggtaatactcatatctatatgaatcacgtagaacaatgtaaagaaattttgaggcgtgaacctaaagagctttgtgagctggtaataagtggtctaccttataaattccgatatctttctactaaagaacaattaaaatatgttcttaaacttaggcctaaagatttcgttcttaacaactatgtatcacaccctcctattaaaggaaagatggcggtgtaa" if featureJoin != seqJoin { t.Errorf("Feature sequence parser has changed on test 'join(893..1441,2459..2770)'. Got this:\n%s instead of \n%s", featureJoin, seqJoin) } // Read complement(2791..3054) - featureComplement, _ := gbk.Features[10].GetSequence() + featureComplement := gbk.Features[10].Sequence seqComplement := "ttattcactacccggcatagacggcccacgctggaataattcgtcatattgtttttccgttaaaacagtaatatcgtagtaacagtcagaagaagttttaactgtggaaattttattatcaaaatactcacgagtcattttatgagtatagtattttttaccataaatggtaataggctgttctggtcctggaacttctaactcgcttgggttaggaagtgtaaaaagaactacaccagaagtatctttaaatcgtaaaatcat" if featureComplement != seqComplement { t.Errorf("Feature sequence parser has changed on test 'complement(2791..3054)'. 
Got this:\n%s instead of \n%s", featureComplement, seqComplement) @@ -235,14 +246,14 @@ func TestLocationParser(t *testing.T) { // that the first sequence should be appended to the reverse sequence, instead of the second sequence // getting appended to the first. Biopython appends the second sequence to the first, and that is logically // the most obvious thing to do, so we are implementing it that way. - featureJoinComplement, _ := gbk.Features[3].GetSequence() + featureJoinComplement := gbk.Features[3].Sequence seqJoinComplement := "ataccaatttaatcattcatttatatactgattccgtaagggttgttacttcatctattttataccaatgcgtttcaaccatttcacgcttgcttatatcatcaagaaaacttgcgtctaattgaactgttgaattaacacgatgccttttaacgatgcgagaaacaactacttcatctgcataaggtaatgcagcatataacagagcaggcccgccaattacacttactttagaattctgatcaagcatagtttcgaatggtgcattagggcttgacacttgaatttcgccgccagaaatgtaagttatatattgctcccaagtaatatagaaatgtgctaaatcgccgtctttagttacaggataatcacgcgcaaggtcacacaccacaatatggctacgaccaggaagtaatgtaggcaatgactggaacgttttagcacccataatcataattgtgccttcagtacgagctttaaaattctggaggtcctttttaactcgtccccatggtaaaccatcacctaaaccgaatgctaattcattaaagccgtcgaccgttttagttggaga" if featureJoinComplement != seqJoinComplement { t.Errorf("Feature sequence parser has changed on test 'join(complement(315..330),complement(339..896))'. 
Got this:\n%s instead of \n%s", featureJoinComplement, seqJoinComplement) } // Read complement(join(893..1098,1101..2770)) - featureComplementJoin, _ := gbk.Features[5].GetSequence() + featureComplementJoin := gbk.Features[5].Sequence seqComplementJoin := "ttacaccgccatctttcctttaataggagggtgtgatacatagttgttaagaacgaaatctttaggcctaagtttaagaacatattttaattgttctttagtagaaagatatcggaatttataaggtagaccacttattaccagctcacaaagctctttaggttcacgcctcaaaatttctttacattgttctacgtgattcatatagatatgagtattaccaccagaaaatatcaaatcccctggaataagattacacatcttagctacaatatgaactaacgtagcatatgacgcaatattaaacggtagcattatgttcagataaggtcgttaatcttaccccggaattatatccagctgcatgtcaccatgcagagcagactatatctccaacttgttaaagcaagttgtctatcgtttcgagtcacttgaccctactccccaaagggatagtcgttaggcatttatgtagaaccaattccatttatcagattttacacgataagtaactaatccagacgaaattttaaaatgtctagctgcatctgctgcacaatcaaaaataaccccatcacatgaaatctttttaatattactaggctttttacctttcatcttttctgatattttagatttagttatgtctgaatgcttatgattaaagaatgaattattttcacctgaacgatttctgcatttactacaagtataagcagaagtttgtatgcgaacaccgcacttacaaaacttatgggtttctggattccaacgcccgtttttacttccgggtttactgtaaagagctttccgaccatcaggtccaagtttaagcatcttagctttaacagtttcagaacgtttcttaataatttcttcttttaatggatgcgtagaacatgtatcaccaaacgttgcatcagcaatattgtatccattaattttagaattaagctctttaatccaaaaattttctcgttcaataatcaaatctttctcatatggaatttcttccaaaatagaacattcaaacacattaccatgtttgttaaaagacctctgaagttttatagaagaatggcatcctttttctaaatctttaaaatgcctcttccatctcttttcaaaatctttagcacttcctacatatactttattgtttaaagtatttttaatctgataaattccgcttttcataaatacctctttaaatatagaagtatttattaaagggcaagtcctacaatttagcacgggattgtctactagagaggttccccgtttagatagattacaagtataagtcaccttatactcaggcctcaattaacccaagaaaacatctactgagcgttgataccactgcaaatccaaatagccattacgcacattaaactgatagaacatatgacaaggcggtaatgccatatatttaagttcagctggattccatgcagaaacaatttgacgcctatcatttggcagttttttaatacgatcaataacttctataatttggtctacaccaccaaaatcacgccactgttttccataaattggaccaagttcaccgctatggtatcctaaatcttttgcttgattttcgtaattttcatcccagactgttttgccttggattaacgaatcgtgttgaattaatcgtaaatcatacatttgtgcttcctgataaaaaccatattagctcagcaatgcaagctttccaggcgagcttcttagttgtta
ccgcaggaaaacctttagttaaatcccagcgtaatttagatccgaacagagcaattgttcctgtgcctgtacgatcatcggtttcataaccattttcaaaaatgtctttaattaaatcttggtattgtttcat" if featureComplementJoin != seqComplementJoin { t.Errorf("Feature sequence parser has changed on test 'complement(join(893..1098,1101..2770))'. Got this:\n%s instead of \n%s", featureComplementJoin, seqComplementJoin) @@ -254,7 +265,7 @@ func TestGenbankNewlineParsingRegression(t *testing.T) { for _, feature := range gbk.Features { if feature.Location.Start == 410 && feature.Location.End == 1750 && feature.Type == "CDS" { - if feature.Attributes["product"] != "chromosomal replication initiator informational ATPase" { + if feature.Attributes["product"][0] != "chromosomal replication initiator informational ATPase" { t.Errorf("Newline parsing has failed.") } break @@ -262,6 +273,12 @@ func TestGenbankNewlineParsingRegression(t *testing.T) { } } +func TestDuplicateFeatureAttributes(t *testing.T) { + seq, _ := Read("../../data/bsub.gbk") + + assert.Equal(t, len(seq.Features[2].Attributes["db_xref"]), 15) +} + func BenchmarkRead(b *testing.B) { for i := 0; i < b.N; i++ { _, _ = Read("../../data/bsub.gbk") @@ -750,9 +767,9 @@ func TestBuildFeatureString(t *testing.T) { } func TestParse_error(t *testing.T) { - parseMultiErr := errors.New("parse error") + parseMultiErr := &ParseError{info: "parse error"} oldParseMultiNthFn := parseMultiNthFn - parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, error) { + parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, *ParseError) { return nil, parseMultiErr } defer func() { @@ -766,7 +783,6 @@ func TestParse_error(t *testing.T) { } func TestParseReferences_error(t *testing.T) { - parseReferencesErr := errors.New("Failed in parsing reference above line 13. 
Got error: ") oldParseReferencesFn := parseReferencesFn parseReferencesFn = func(metadataData []string) (Reference, error) { return Reference{}, errors.New("") @@ -776,20 +792,21 @@ func TestParseReferences_error(t *testing.T) { }() file, _ := os.Open("../../data/puc19.gbk") _, err := parseMultiNthFn(file, 1) - assert.EqualError(t, err, parseReferencesErr.Error()) + assert.Error(t, err) } func TestIssue303Regression(t *testing.T) { seq, _ := Read("../../data/puc19_303_regression.gbk") expectedAttribute := "16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase" for _, feature := range seq.Features { - if feature.Attributes["locus_tag"] == "JCVISYN3A_0004" && feature.Type == "CDS" { - if feature.Attributes["product"] != expectedAttribute { - t.Errorf("Failed to get proper expected attribute. Got: %s Expected: %s", feature.Attributes["product"], expectedAttribute) + locus, ok := feature.Attributes["locus_tag"] + if ok && locus[0] == "JCVISYN3A_0004" && feature.Type == "CDS" { + if feature.Attributes["product"][0] != expectedAttribute { + t.Errorf("Failed to get proper expected attribute. Got: %s Expected: %s", feature.Attributes["product"][0], expectedAttribute) } } - if feature.Attributes["locus_tag"] == "JCVISYN3A_0051" && feature.Type == "CDS" { - if _, ok := feature.Attributes["pseudo"]; !ok { + if ok && locus[0] == "JCVISYN3A_0051" && feature.Type == "CDS" { + if _, ok = feature.Attributes["pseudo"]; !ok { t.Errorf("pseudo should be in attributes") } } @@ -802,3 +819,11 @@ func TestConsortiumRegression(t *testing.T) { t.Errorf("Failed to read consrtm. 
Got err: %s", err) } } + +// testcase for subset of unusual file +// includes implicit genome range with partial +// and origin is replaced with contig +func TestParseS288C_IX_Regression(t *testing.T) { + _, err := ReadMulti("../../data/NC_001141.2_redux.gb") + assert.Nil(t, err) +} diff --git a/io/genbank/multimap.go b/io/genbank/multimap.go new file mode 100644 index 00000000..3f067272 --- /dev/null +++ b/io/genbank/multimap.go @@ -0,0 +1,65 @@ +/* +This file provides utilities for working with a MultiMap, +which is simply a map which can store multiple values for a single key instead +of the usual one. + +Useful for when we expect to encounter repeated keys but we want to store all pairs, +not just the last-inserted one. This implementation has the advantage of being compatible with +json.Marshal, cmp.Diff, pretty printing, and bracket indexing out of the box. +Does not make uniqueness guarantees for key value pairs. +Currently only used in genbank.Feature.Attributes, however this may end up +being useful for other parsers which allow for repeated keys, in which case +this should be made into its own module. +*/ +package genbank + +// MultiMap is defined as a simple type alias over a map of slices. +type MultiMap[K, V comparable] map[K][]V + +// NewMultiMap creates a new empty multimap. +func NewMultiMap[K, V comparable]() MultiMap[K, V] { + return make(map[K][]V) +} + +// Put adds a key-value pair to the multimap. +func Put[K, V comparable](m MultiMap[K, V], k K, v ...V) { + if _, ok := m[k]; !ok { + m[k] = v + } else { + m[k] = append(m[k], v...) + } +} + +// ForEachKey iterates over the multimap, once for each key with all values passed as a slice. +// do is a callback that takes the key, values slice for that key +// This exists purely as a convenience function, if your use case would benefit from +// early break/return, it is recommended you use the usual range iteration operator. 
+func ForEachKey[K, V comparable](m MultiMap[K, V], do func(K, []V)) { + for k, values := range m { + do(k, values) + } +} + +// ForEachValue iterates over the multimap, once for each value +// do is a callback that takes a key and value. +func ForEachValue[K, V comparable](m MultiMap[K, V], do func(K, V)) { + ForEachKey(m, func(k K, values []V) { + for _, v := range values { + do(k, v) + } + }) +} + +// MapSlice efficiently applies a transformation to each element of a slice to create a new slice +func MapSlice[X any, Y any](slice []X, mapper func(X) Y) []Y { + y := make([]Y, len(slice)) + for i, x := range slice { + y[i] = mapper(x) + } + return y +} + +// identity returns its input, useful for using MapSlice to do a shallow copy +func identity[X any](x X) X { + return x +} diff --git a/io/polyjson/polyjson.go b/io/polyjson/polyjson.go index 81370480..d723dfdb 100644 --- a/io/polyjson/polyjson.go +++ b/io/polyjson/polyjson.go @@ -13,6 +13,7 @@ import ( "os" "time" + "github.com/TimothyStiles/poly/io/genbank" "github.com/TimothyStiles/poly/transform" ) @@ -153,6 +154,82 @@ func Write(sequence Poly, path string) error { return os.WriteFile(path, file, 0644) } +// Utilities to convert polyjson objects -> their genbank equivalents +// TODO add convert <- genbank methods, which is currently difficult as most +// genbank Meta values are discarded due to lack of support for wildcard metadata in polyjson. 
+ +func (sequence *Poly) ToGenbank() genbank.Genbank { + gb := genbank.Genbank{ + Meta: sequence.Meta.ToGenbank(), + Features: make([]genbank.Feature, len(sequence.Features)), + Sequence: sequence.Sequence, + } + for i, f := range sequence.Features { + gb.Features[i] = f.ToGenbank() + gb.Features[i].ParentSequence = &gb + } + return gb +} + +func (meta *Meta) ToGenbank() genbank.Meta { + other := make(map[string]string) + if meta.URL != "" { + other["URL"] = meta.URL + } + if meta.CreatedBy != "" { + other["CreatedBy"] = meta.CreatedBy + } + if meta.CreatedWith != "" { + other["CreatedWith"] = meta.CreatedWith + } + other["CreatedOn"] = meta.CreatedOn.String() + if meta.Schema != "" { + other["Schema"] = meta.Schema + } + return genbank.Meta{ + Definition: meta.Description, + Source: meta.CreatedBy, + Origin: meta.CreatedWith, + Name: meta.Name, + SequenceHash: meta.Hash, + Other: other, + } +} + +func (feature *Feature) ToGenbank() genbank.Feature { + attributes := genbank.NewMultiMap[string, string]() + for key, value := range feature.Tags { + genbank.Put(attributes, key, value) + } + genbank.Put(attributes, "Name", feature.Name) + + return genbank.Feature{ + Type: feature.Type, + Description: feature.Description, + Attributes: attributes, + SequenceHash: feature.Hash, + Sequence: feature.Sequence, + Location: feature.Location.ToGenbank(), + } +} + +func (location *Location) ToGenbank() genbank.Location { + loc := genbank.Location{ + Start: location.Start, + End: location.End, + Complement: location.Complement, + Join: location.Join, + FivePrimePartial: location.FivePrimePartial, + ThreePrimePartial: location.ThreePrimePartial, + SubLocations: genbank.MapSlice( + location.SubLocations, + func(s Location) genbank.Location { return s.ToGenbank() }, + ), + } + loc.GbkLocationString = genbank.BuildLocationString(loc) + return loc +} + /****************************************************************************** JSON specific IO related things end here. 
diff --git a/io/polyjson/polyjson_test.go b/io/polyjson/polyjson_test.go index eaee25b1..dedd7039 100644 --- a/io/polyjson/polyjson_test.go +++ b/io/polyjson/polyjson_test.go @@ -104,3 +104,9 @@ func TestWrite_error(t *testing.T) { err := Write(Poly{}, "/tmp/file") assert.EqualError(t, err, marshalIndentErr.Error()) } + +func TestConvert(t *testing.T) { + sequence, err := Read("../../data/cat.json") + assert.NoError(t, err) + sequence.ToGenbank() +} diff --git a/tutorials/001_input_output_test.go b/tutorials/001_input_output_test.go index c44eee48..aeecce81 100644 --- a/tutorials/001_input_output_test.go +++ b/tutorials/001_input_output_test.go @@ -46,6 +46,13 @@ TTFN, Tim ******************************************************************************/ +// Ignore ParentSequence as that's a pointer which can't be serialized. +func CmpOptions() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence"), + } +} + // if you're using VS-CODE you should see a DEBUG TEST button right below this // comment. Please set break points and use it early and often. func TestFileIOTutorial(t *testing.T) { @@ -166,8 +173,7 @@ func TestFileIOTutorial(t *testing.T) { } // compare our read-in plasmid to the the one we wrote out. - // Ignore ParentSequence as that's a pointer which can't be serialized. - if diff := cmp.Diff(puc19, puc19Copy, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, puc19Copy, CmpOptions()...); diff != "" { t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). 
Got this diff:\n%s", filepath.Base(puc19Path), diff) } @@ -190,7 +196,7 @@ func TestFileIOTutorial(t *testing.T) { if err := json.Unmarshal(jsonContent, &unmarshaledPuc19); err != nil { t.Error(err) } - if diff := cmp.Diff(puc19, unmarshaledPuc19, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, unmarshaledPuc19, CmpOptions()...); diff != "" { t.Errorf("Parsing the JSON does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(puc19Path), diff) }