Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Io to bio #1

Merged
merged 70 commits into from
Dec 7, 2023
Merged
Changes from 1 commit
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
b87947b
Moved io to bio
Koeng101 Aug 24, 2023
5c887a3
fixed io imports
Koeng101 Aug 24, 2023
d8f4b38
Add more generic definitions to bio
Koeng101 Aug 31, 2023
4fb41ff
Update bio/fastq/fastq.go
Koeng101 Sep 1, 2023
2452282
update fasta
Koeng101 Sep 1, 2023
6dda2b9
Merge branch 'ioToBio' of github.com:TimothyStiles/poly into ioToBio
Koeng101 Sep 1, 2023
16fbcbd
add fasta updates and parser
Koeng101 Sep 1, 2023
382a014
made readability improvements
Koeng101 Sep 2, 2023
0bbd05e
changed ParseWithHeader
Koeng101 Sep 2, 2023
eb68f81
removed int64 in reads
Koeng101 Sep 2, 2023
344220c
add more example tests
Koeng101 Sep 2, 2023
03f8b68
gotta update this for this tests!
Koeng101 Sep 2, 2023
6199c43
integrate slow5
Koeng101 Sep 5, 2023
65f0539
have examples covering most of changes
Koeng101 Sep 5, 2023
8ff6da4
removed interfaces
Koeng101 Sep 5, 2023
00732a4
updated with NewXXXParser
Koeng101 Sep 7, 2023
3ce8109
added 3 parsers
Koeng101 Sep 7, 2023
630bd88
added pileup
Koeng101 Sep 7, 2023
df98fe3
add concurrent functions plus better documentation
Koeng101 Sep 9, 2023
fa4d29a
moved svb to ioToBio
Koeng101 Sep 11, 2023
f80b317
Improve tests
Koeng101 Sep 11, 2023
37859a8
make better docs for header
Koeng101 Sep 11, 2023
e24801b
Update bio/fasta/fasta_test.go
Koeng101 Sep 12, 2023
584b73e
changed name of LowLevelParser to parserInterface
Koeng101 Sep 12, 2023
da7118a
Merge branch 'main' into ioToBio
Koeng101 Sep 12, 2023
5e6204f
zw -> zipWriter
Koeng101 Sep 12, 2023
90316d3
remove a identifier from pileup
Koeng101 Sep 12, 2023
7b2cd52
genbank parser now compatible
Koeng101 Sep 13, 2023
9b55fda
writeTo interface now fulfilled
Koeng101 Sep 13, 2023
6655565
make linter happy :)
Koeng101 Sep 13, 2023
11972ae
convert all types to io.WriterTo
Koeng101 Sep 14, 2023
12a4b48
fixed linter issues
Koeng101 Sep 14, 2023
4b50625
handle EOF better
Koeng101 Sep 14, 2023
f44721c
fixed tutorial
Koeng101 Sep 14, 2023
b192fda
fix genbank read error
Koeng101 Sep 14, 2023
3eab1f9
remove io.WriterTo checks
Koeng101 Sep 14, 2023
0edfd1c
fix with cmp.Equal
Koeng101 Sep 16, 2023
34de749
Merge pull request #341 from TimothyStiles/slow5StreamVByte2
Koeng101 Sep 16, 2023
6abe0cd
Merge branch 'main' into ioToBio
Koeng101 Oct 28, 2023
4c61c22
genbank tests merged
Koeng101 Oct 28, 2023
1d23668
sample merge
Koeng101 Oct 28, 2023
56772bb
Merge branch 'main' of github.com:TimothyStiles/poly into ioToBio
Koeng101 Oct 28, 2023
956d26e
make linter happy
Koeng101 Oct 28, 2023
158fcf1
Added generic collections module
abondrn Oct 30, 2023
8862a6c
Switched Feature.Attributes to use multimap
abondrn Oct 30, 2023
ef07e94
Fixed tests
abondrn Oct 30, 2023
cac1e55
Ran linter
abondrn Oct 30, 2023
8025bc2
Added copy methods
abondrn Oct 30, 2023
1f49f9d
Adds new functional test that addresses case where there is a partial…
abondrn Oct 30, 2023
8112866
Ran linter
abondrn Oct 30, 2023
fec8796
Add capability to compute sequence features and marshal en masse
abondrn Oct 30, 2023
9ce9f4f
Add methods to convert polyjson -> genbank
abondrn Oct 30, 2023
89a2ba4
Removed generic collections library in favor of hand-rolled multimap,…
abondrn Oct 30, 2023
b88d7b8
Propogate handrolled multimap to test files
abondrn Oct 30, 2023
b4c3a37
Responded to more comments
abondrn Oct 30, 2023
8b82d7b
Reduced new example genbank file
abondrn Oct 31, 2023
f523651
Resolved lint errors, added test StoreFeatureSequences and fixed unco…
abondrn Oct 31, 2023
1270ec8
Added multimap.go file doc
abondrn Oct 31, 2023
9c322f6
Responded to more comments
abondrn Oct 31, 2023
f124fae
First merge attempt
abondrn Nov 4, 2023
98b6984
Fixed deref issue
abondrn Nov 4, 2023
fc2ca75
Merged updated branch
abondrn Nov 4, 2023
25e0f61
Fixed tests, moved genbank files
abondrn Nov 4, 2023
60abf6d
Fixed fasta docs
abondrn Nov 4, 2023
7e3c812
Added changelog
abondrn Nov 5, 2023
35a5492
Merge pull request #394 from abondrn/ioToBio-genbank
Koeng101 Nov 5, 2023
433df00
added to changelog
Koeng101 Nov 10, 2023
8fb83b8
changed
Koeng101 Dec 7, 2023
74aa431
Merge branch 'main' into ioToBio
Koeng101 Dec 7, 2023
c377f64
fix linter issues
Koeng101 Dec 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
have examples covering most of changes
Koeng101 committed Sep 5, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 65f0539c85136718b0bf573bbe860077c6e57b6a
55 changes: 10 additions & 45 deletions bio/bio.go
Original file line number Diff line number Diff line change
@@ -56,16 +56,8 @@ Lower level interfaces
******************************************************************************/

type LowLevelParser[DataType fasta.Record | slow5.Read, DataTypeHeader fasta.Header | slow5.Header] interface {
Header() (DataTypeHeader, error)
Next() (DataType, error)
}

type Record interface {
WriteTo(w io.Writer) (n int64, err error)
}

type Header interface {
WriteTo(w io.Writer) (n int64, err error)
Header() (*DataTypeHeader, error)
Next() (*DataType, error)
}

/******************************************************************************
@@ -141,16 +133,16 @@ Parser higher-level functions

******************************************************************************/

func (p *Parser[DataType, DataTypeHeader]) Next() (DataType, error) {
func (p *Parser[DataType, DataTypeHeader]) Next() (*DataType, error) {
return p.LowLevelParser.Next()
}

func (p *Parser[DataType, DataTypeHeader]) Header() (DataTypeHeader, error) {
func (p *Parser[DataType, DataTypeHeader]) Header() (*DataTypeHeader, error) {
return p.LowLevelParser.Header()
}

func (p *Parser[DataType, DataTypeHeader]) ParseN(countN int) ([]DataType, error) {
var records []DataType
func (p *Parser[DataType, DataTypeHeader]) ParseN(countN int) ([]*DataType, error) {
var records []*DataType
for counter := 0; counter < countN; counter++ {
record, err := p.Next()
if err != nil {
@@ -164,11 +156,11 @@ func (p *Parser[DataType, DataTypeHeader]) ParseN(countN int) ([]DataType, error
return records, nil
}

func (p *Parser[DataType, DataTypeHeader]) Parse() ([]DataType, error) {
func (p *Parser[DataType, DataTypeHeader]) Parse() ([]*DataType, error) {
return p.ParseN(math.MaxInt)
}

func (p *Parser[DataType, DataTypeHeader]) ParseWithHeader() ([]DataType, DataTypeHeader, error) {
func (p *Parser[DataType, DataTypeHeader]) ParseWithHeader() ([]*DataType, *DataTypeHeader, error) {
header, headerErr := p.Header()
data, err := p.Parse()
if headerErr != nil {
@@ -188,36 +180,9 @@ func (p *Parser[DataType, DataTypeHeader]) ParseToChannel(channel chan<- DataTyp
if errors.Is(err, io.EOF) {
err = nil // EOF not treated as parsing error.
}
close(channel)
return err
}
channel <- record
}
}

/******************************************************************************

Writer functions

******************************************************************************/

// WriteAll writes header to w, followed by every element of data in slice
// order. It stops at the first WriteTo call that fails and returns that error
// unchanged; anything written before the failure stays in w. It returns nil
// when every write succeeds.
func WriteAll(data []io.WriterTo, header io.WriterTo, w io.Writer) error {
	// The header always goes first so that readers see it before any record.
	if _, err := header.WriteTo(w); err != nil {
		return err
	}
	for i := range data {
		if _, err := data[i].WriteTo(w); err != nil {
			return err
		}
	}
	return nil
}

func WriteFile(data []io.WriterTo, header io.WriterTo, path string) error {
file, err := os.Open(path)
if err != nil {
return err
channel <- *record
}
return WriteAll(data, header, file)
}
87 changes: 82 additions & 5 deletions bio/example_test.go
Original file line number Diff line number Diff line change
@@ -5,10 +5,10 @@ import (
"compress/gzip"
"fmt"
"io"
"log"
"strings"

"github.com/TimothyStiles/poly/bio"
"github.com/TimothyStiles/poly/bio/fasta"
"github.com/TimothyStiles/poly/bio/genbank"
"github.com/TimothyStiles/poly/bio/gff"
"github.com/TimothyStiles/poly/bio/polyjson"
@@ -134,6 +134,86 @@ DIDGDGQVNYEEFVQMMTAK*`))
// Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*
}

// ExampleParseWithHeader parses a slow5 stream in one call, obtaining both the
// header and every read, and prints the slow5 version next to the first read's
// ID.
func ExampleParseWithHeader() {
	// The following can be replaced with any io.Reader. For example,
	// `file, err := os.Open(path)` for file would also work.
	file := strings.NewReader(`#slow5_version 0.2.0
#num_read_groups 1
@asic_id 4175987214
#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char*
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number
0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10
`)
	// Errors are discarded to keep the example short.
	genericParser, _ := bio.NewParser(bio.Slow5, file) // Make a parser with the file
	slow5Parser := genericParser.(bio.Slow5Parser)
	reads, header, _ := slow5Parser.ParseWithHeader() // Parse the header and all data records from file

	version := header.HeaderValues[0].Slow5Version
	firstReadID := reads[0].ReadID
	fmt.Printf("%s, %s\n", version, firstReadID)
	// Output: 0.2.0, 0026631e-33a3-49ab-aa22-3ab157d71f8b
}

// ExampleParseToChannel streams fasta records through a channel from a parser
// running in its own goroutine, collects them, and prints the sequence of the
// second record.
func ExampleParseToChannel() {
	// The following can be replaced with any io.Reader. For example,
	// `file, err := os.Open(path)` for file would also work.
	file := strings.NewReader(`>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
IENY

>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
DIDGDGQVNYEEFVQMMTAK*`)
	genericParser, _ := bio.NewParser(bio.Fasta, file) // Make a parser with the file
	fastaParser := genericParser.(bio.FastaParser)

	// The parser sends each record on the channel and closes it when it stops
	// (at EOF or on a parse error), which is what ends the receive loop below.
	recordStream := make(chan fasta.Record)
	go fastaParser.ParseToChannel(recordStream)

	var collected []fasta.Record
	for {
		record, open := <-recordStream
		if !open {
			break
		}
		collected = append(collected, record)
	}

	fmt.Println(collected[1].Sequence)
	// Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*
}

// ExampleWriteAll parses a slow5 stream, then writes the header followed by
// every read into a buffer using their WriteTo methods, and prints the result.
func ExampleWriteAll() {
	// The following can be replaced with any io.Reader. For example,
	// `file, err := os.Open(path)` for file would also work.
	file := strings.NewReader(`#slow5_version 0.2.0
#num_read_groups 1
@asic_id 4175987214
#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char*
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number
0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10
`)
	parserInterface, _ := bio.NewParser(bio.Slow5, file) // Make a parser with the file
	parser := parserInterface.(bio.Slow5Parser)
	reads, header, _ := parser.ParseWithHeader() // Parse all data records from file

	// Write the files to an io.Writer.
	// All headers and all records implement io.WriterTo interfaces.
	// Errors are explicitly discarded to keep the example short; production
	// code should check them.
	var buffer bytes.Buffer
	_, _ = header.WriteTo(&buffer)
	for _, read := range reads {
		_, _ = read.WriteTo(&buffer)
	}

	fmt.Println(buffer.String())
	// Output: #slow5_version 0.2.0
	//#num_read_groups 1
	//@asic_id 4175987214
	//#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char*
	//#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number
	//0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10
	//
	//
}

func ExampleFasta() {
// The following can be replaced with any io.Reader. For example,
// `file, err := os.Open(path)` for file would also work.
@@ -166,10 +246,7 @@ func ExampleSlow5() {
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number
0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10
`)
parserInterface, err := bio.NewParser(bio.Slow5, file) // Make a parser with the file
if err != nil {
log.Fatalf("%s", err)
}
parserInterface, _ := bio.NewParser(bio.Slow5, file) // Make a parser with the file
parser := parserInterface.(bio.Slow5Parser)
reads, _ := parser.Parse() // Parse all data records from file

2 changes: 1 addition & 1 deletion bio/fasta/example_test.go
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ func Example_basic() {
}
break
}
records = append(records, record)
records = append(records, *record)
}
fmt.Println(records[0].Sequence)
// Output: ATGC
16 changes: 8 additions & 8 deletions bio/fasta/fasta.go
Original file line number Diff line number Diff line change
@@ -79,8 +79,8 @@ type Parser struct {
}

// Header returns an empty Header and a nil error.
func (p *Parser) Header() (Header, error) {
return Header{}, nil
func (p *Parser) Header() (*Header, error) {
return &Header{}, nil
}

// NewParser returns a Parser that uses r as the source
@@ -107,9 +107,9 @@ func NewParser(r io.Reader, maxLineSize int) *Parser {
// It is worth noting that the number of bytes read always runs right up to
// where the next fasta starts, which means this function can effectively be
// used to index where fastas start in a file or string.
func (p *Parser) Next() (Record, error) {
func (p *Parser) Next() (*Record, error) {
if p.more == false {
return Record{}, io.EOF
return &Record{}, io.EOF
}
for p.scanner.Scan() {
line := p.scanner.Bytes()
@@ -128,7 +128,7 @@ func (p *Parser) Next() (Record, error) {
case line[0] != '>' && p.start:
err := fmt.Errorf("invalid input: missing sequence identifier for sequence starting at line %d", p.line)
record, _ := p.newRecord()
return record, err
return &record, err
// start of a fasta line
case line[0] != '>':
p.buff.Write(line)
@@ -137,7 +137,7 @@ func (p *Parser) Next() (Record, error) {
record, err := p.newRecord()
// New name
p.identifier = string(line[1:])
return record, err
return &record, err
// Process first line of file
case line[0] == '>' && p.start:
p.identifier = string(line[1:])
@@ -148,9 +148,9 @@ func (p *Parser) Next() (Record, error) {
// Add final sequence in file
record, err := p.newRecord()
if err != nil {
return record, err
return &record, err
}
return record, p.scanner.Err()
return &record, p.scanner.Err()
}

func (p *Parser) newRecord() (Record, error) {
8 changes: 4 additions & 4 deletions bio/fasta/fasta_test.go
Original file line number Diff line number Diff line change
@@ -54,7 +54,7 @@ func TestParser(t *testing.T) {
}
break
}
fastas = append(fastas, fa)
fastas = append(fastas, *fa)
}
if len(fastas) != len(test.expected) {
t.Errorf("case index %d: got %d fastas, expected %d", testIndex, len(fastas), len(test.expected))
@@ -84,7 +84,7 @@ func TestReadEmptyFasta(t *testing.T) {
targetError = err
break
}
fastas = append(fastas, fa)
fastas = append(fastas, *fa)
}
if targetError == nil {
t.Errorf("expected error reading empty fasta stream")
@@ -108,7 +108,7 @@ func TestReadEmptySequence(t *testing.T) {
targetError = err
break
}
fastas = append(fastas, fa)
fastas = append(fastas, *fa)
}
if targetError == nil {
t.Errorf("expected error reading empty fasta sequence stream: %s", targetError)
@@ -129,7 +129,7 @@ func TestBufferSmall(t *testing.T) {
targetError = err
break
}
fastas = append(fastas, fa)
fastas = append(fastas, *fa)
}
if targetError == nil {
t.Errorf("expected error with too small of a buffer")
2 changes: 1 addition & 1 deletion bio/slow5/example_test.go
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ func ExampleNewParser() {
// Break at EOF
break
}
outputReads = append(outputReads, read)
outputReads = append(outputReads, *read)
}

fmt.Println(outputReads[0].RawSignal[0:10])
Loading