From 73deda8f7ea024cee5000b60601cd81eee456ba2 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 16:30:09 -0600 Subject: [PATCH 01/17] More consistently use encoding instead to toString --- test/format.test.ts | 29 +++++++++++++++++------------ test/parse.test.ts | 7 ++++--- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/test/format.test.ts b/test/format.test.ts index a510cfc..ed71169 100644 --- a/test/format.test.ts +++ b/test/format.test.ts @@ -13,13 +13,16 @@ describe('GFF3 formatting', () => { ;['spec_eden', 'au9_scaffold_subset', 'hybrid1', 'hybrid2'].forEach( (file) => { it(`can roundtrip ${file}.gff3 with formatSync`, () => { - const inputGFF3 = fs - .readFileSync(require.resolve(`./data/${file}.gff3`)) - .toString('utf8') + const inputGFF3 = fs.readFileSync( + require.resolve(`./data/${file}.gff3`), + 'utf8', + ) const expectedGFF3 = fs - .readFileSync(require.resolve(`./data/${file}.reformatted.gff3`)) - .toString('utf8') + .readFileSync( + require.resolve(`./data/${file}.reformatted.gff3`), + 'utf8', + ) .replace(/###\n/g, '') // formatSync does not insert sync marks const items = gff.parseStringSync(inputGFF3, { parseAll: true }) @@ -28,9 +31,10 @@ describe('GFF3 formatting', () => { }) it(`can roundtrip ${file}.gff3 with formatStream`, async () => { - const expectedGFF3 = ( - await readfile(require.resolve(`./data/${file}.reformatted.gff3`)) - ).toString('utf8') + const expectedGFF3 = await readfile( + require.resolve(`./data/${file}.reformatted.gff3`), + 'utf8', + ) const resultGFF3 = await getStream( fs @@ -63,11 +67,12 @@ describe('GFF3 formatting', () => { await gff.formatFile(gff3In, fs.createWriteStream(tmpFile.path)) await fdatasync(tmpFile.fd) - const resultGFF3 = (await readfile(tmpFile.path)).toString('utf8') + const resultGFF3 = await readfile(tmpFile.path, 'utf8') - const expectedGFF3 = ( - await readfile(require.resolve(`./data/${file}.reformatted.gff3`)) - ).toString('utf8') + const expectedGFF3 = await readfile( + require.resolve(`./data/${file}.reformatted.gff3`), + 'utf8', + ) expect(resultGFF3).toEqual(expectedGFF3) }) diff --git a/test/parse.test.ts b/test/parse.test.ts index 00e57af..35997e2 100644 --- a/test/parse.test.ts +++ b/test/parse.test.ts @@ -168,9 +168,10 @@ describe('GFF3 parser', () => { ) it('can parse a string synchronously', () => { - const gff3 = fs - .readFileSync(require.resolve('./data/spec_eden.gff3')) - .toString('utf8') + const gff3 = fs.readFileSync( + require.resolve('./data/spec_eden.gff3'), + 'utf8', + ) const result = gff.parseStringSync(gff3, { parseFeatures: true, parseDirectives: true, From 93b34788d61910b31cfa7fbd2ad13016bb7aa89b Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 16:42:01 -0600 Subject: [PATCH 02/17] Use more promises over sync in tests --- package.json | 5 ++++- test/format.test.ts | 37 ++++++++++++++++++++----------------- test/parse.test.ts | 22 ++++++++++++++-------- yarn.lock | 8 ++++---- 4 files changed, 42 insertions(+), 30 deletions(-) diff --git a/package.json b/package.json index 1dd38eb..d41b6fe 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,9 @@ "bin": { "gff-to-json": "dist/gff-to-json.js" }, + "engines": { + "node": ">=10" + }, "files": [ "dist", "esm", @@ -50,7 +53,7 @@ }, "devDependencies": { "@types/jest": "^29.2.4", - "@types/node": "^18.11.16", + "@types/node": "^10.17.60", "@typescript-eslint/eslint-plugin": "^5.46.1", "@typescript-eslint/parser": "^5.46.1", "documentation": "^14.0.1", diff --git a/test/format.test.ts 
b/test/format.test.ts index ed71169..07fb94b 100644 --- a/test/format.test.ts +++ b/test/format.test.ts @@ -1,4 +1,9 @@ -import fs from 'fs' +import { + createReadStream, + createWriteStream, + fdatasync as fdatasyncC, + promises as fsPromises, +} from 'fs' import tmp from 'tmp-promise' import { promisify } from 'util' @@ -6,24 +11,23 @@ import getStream from 'get-stream' import gff from '../src' -const readfile = promisify(fs.readFile) -const fdatasync = promisify(fs.fdatasync) +const fdatasync = promisify(fdatasyncC) describe('GFF3 formatting', () => { ;['spec_eden', 'au9_scaffold_subset', 'hybrid1', 'hybrid2'].forEach( (file) => { - it(`can roundtrip ${file}.gff3 with formatSync`, () => { - const inputGFF3 = fs.readFileSync( + it(`can roundtrip ${file}.gff3 with formatSync`, async () => { + const inputGFF3 = await fsPromises.readFile( require.resolve(`./data/${file}.gff3`), 'utf8', ) - const expectedGFF3 = fs - .readFileSync( + const expectedGFF3 = ( + await fsPromises.readFile( require.resolve(`./data/${file}.reformatted.gff3`), 'utf8', ) - .replace(/###\n/g, '') // formatSync does not insert sync marks + ).replace(/###\n/g, '') // formatSync does not insert sync marks const items = gff.parseStringSync(inputGFF3, { parseAll: true }) const resultGFF3 = gff.formatSync(items) @@ -31,14 +35,13 @@ describe('GFF3 formatting', () => { }) it(`can roundtrip ${file}.gff3 with formatStream`, async () => { - const expectedGFF3 = await readfile( + const expectedGFF3 = await fsPromises.readFile( require.resolve(`./data/${file}.reformatted.gff3`), 'utf8', ) const resultGFF3 = await getStream( - fs - .createReadStream(require.resolve(`./data/${file}.gff3`)) + createReadStream(require.resolve(`./data/${file}.gff3`)) .pipe( gff.parseStream({ parseFeatures: true, @@ -60,16 +63,16 @@ describe('GFF3 formatting', () => { it(`can roundtrip ${file}.gff3 with formatFile`, async () => { jest.setTimeout(1000) await tmp.withFile(async (tmpFile) => { - const gff3In = fs - .createReadStream(require.resolve(`./data/${file}.gff3`)) - .pipe(gff.parseStream({ parseAll: true })) + const gff3In = createReadStream( + require.resolve(`./data/${file}.gff3`), + ).pipe(gff.parseStream({ parseAll: true })) - await gff.formatFile(gff3In, fs.createWriteStream(tmpFile.path)) + await gff.formatFile(gff3In, createWriteStream(tmpFile.path)) await fdatasync(tmpFile.fd) - const resultGFF3 = await readfile(tmpFile.path, 'utf8') + const resultGFF3 = await fsPromises.readFile(tmpFile.path, 'utf8') - const expectedGFF3 = await readfile( + const expectedGFF3 = await fsPromises.readFile( require.resolve(`./data/${file}.reformatted.gff3`), 'utf8', ) diff --git a/test/parse.test.ts b/test/parse.test.ts index 35997e2..da89284 100644 --- a/test/parse.test.ts +++ b/test/parse.test.ts @@ -1,4 +1,4 @@ -import fs from 'fs' +import { createReadStream, promises as fsPromises } from 'fs' import gff from '../src' import { formatFeature, @@ -30,7 +30,7 @@ function readAll( } // $p->max_lookback(1) - fs.createReadStream(require.resolve(filename)) + createReadStream(require.resolve(filename)) .pipe( gff.parseStream({ parseFeatures: true, @@ -59,7 +59,7 @@ describe('GFF3 parser', () => { it('can parse gff3_with_syncs.gff3', async () => { const stuff = await readAll('./data/gff3_with_syncs.gff3') const referenceResult = JSON.parse( - fs.readFileSync( + await fsPromises.readFile( require.resolve('./data/gff3_with_syncs.result.json'), 'utf8', ), @@ -92,7 +92,7 @@ describe('GFF3 parser', () => { const stuff = await readAll('./data/knownGene_out_of_order.gff3') 
// $p->max_lookback(2); const expectedOutput = JSON.parse( - fs.readFileSync( + await fsPromises.readFile( require.resolve('./data/knownGene_out_of_order.result.json'), 'utf8', ), @@ -133,7 +133,10 @@ describe('GFF3 parser', () => { expect(mrnaLines[2].child_features).toHaveLength(6) const referenceResult = JSON.parse( - fs.readFileSync(require.resolve('./data/spec_eden.result.json'), 'utf8'), + await fsPromises.readFile( + require.resolve('./data/spec_eden.result.json'), + 'utf8', + ), ) expect(stuff.all).toEqual(referenceResult) }) @@ -167,8 +170,8 @@ describe('GFF3 parser', () => { }, ) - it('can parse a string synchronously', () => { - const gff3 = fs.readFileSync( + it('can parse a string synchronously', async () => { + const gff3 = await fsPromises.readFile( require.resolve('./data/spec_eden.gff3'), 'utf8', ) @@ -179,7 +182,10 @@ describe('GFF3 parser', () => { }) expect(result).toHaveLength(3) const referenceResult = JSON.parse( - fs.readFileSync(require.resolve('./data/spec_eden.result.json'), 'utf8'), + await fsPromises.readFile( + require.resolve('./data/spec_eden.result.json'), + 'utf8', + ), ) expect(result).toEqual(referenceResult) }) diff --git a/yarn.lock b/yarn.lock index 55419fd..382b453 100644 --- a/yarn.lock +++ b/yarn.lock @@ -949,10 +949,10 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-17.0.23.tgz#3b41a6e643589ac6442bdbd7a4a3ded62f33f7da" integrity sha512-UxDxWn7dl97rKVeVS61vErvw086aCYhDLyvRQZ5Rk65rZKepaFdm53GeqXaKBuOhED4e9uWq34IC3TdSdJJ2Gw== -"@types/node@^18.11.16": - version "18.11.16" - resolved "https://registry.yarnpkg.com/@types/node/-/node-18.11.16.tgz#966cae211e970199559cfbd295888fca189e49af" - integrity sha512-6T7P5bDkRhqRxrQtwj7vru+bWTpelgtcETAZEUSdq0YISKz8WKdoBukQLYQQ6DFHvU9JRsbFq0JH5C51X2ZdnA== +"@types/node@^10.17.60": + version "10.17.60" + resolved "https://registry.yarnpkg.com/@types/node/-/node-10.17.60.tgz#35f3d6213daed95da7f0f73e75bcc6980e90597b" + integrity sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw== "@types/normalize-package-data@^2.4.1": version "2.4.1" From 61e76250a64e8ef623328af0b6b9cbb15f6e287f Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 16:55:49 -0600 Subject: [PATCH 03/17] Re-flow comments --- README.md | 13 +++++-------- src/api.ts | 12 ++++++------ src/parse.ts | 27 ++++++++++++--------------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 4a5dfe6..9215895 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,7 @@ import gff from '@gmod/gff' const fs = require('fs') -// parse a file from a file name -// parses only features and sequences by default, +// parse a file from a file name. parses only features and sequences by default, // set options to parse directives and/or comments fs.createReadStream('path/to/my/file.gff3') .pipe(gff.parseStream({ parseAll: true })) @@ -54,16 +53,14 @@ const arrayOfThings = gff.parseStringSync(stringOfGFF3) // format an array of items to a string const newStringOfGFF3 = gff.formatSync(arrayOfThings) -// format a stream of things to a stream of text. -// inserts sync marks automatically. +// format a stream of things to a stream of text. inserts sync marks +// automatically. myStreamOfGFF3Objects .pipe(gff.formatStream()) .pipe(fs.createWriteStream('my_new.gff3')) -// format a stream of things and write it to -// a gff3 file. inserts sync marks and a -// '##gff-version 3' header if one is not -// already present +// format a stream of things and write it to a gff3 file. 
inserts sync marks and +// a '##gff-version 3' header if one is not already present gff.formatFile( myStreamOfGFF3Objects, fs.createWriteStream('my_new_2.gff3', { encoding: 'utf8' }), diff --git a/src/api.ts b/src/api.ts index f4b32c9..a1b571b 100644 --- a/src/api.ts +++ b/src/api.ts @@ -546,8 +546,8 @@ class FormattingTransform extends Transform { _encoding: BufferEncoding, callback: TransformCallback, ) { - // if we have not emitted anything yet, and this first - // chunk is not a gff-version directive, emit one + // if we have not emitted anything yet, and this first chunk is not a + // gff-version directive, emit one let str if (!this.haveWeEmittedData && this.insertVersionDirective) { const thisChunk = Array.isArray(chunk) ? chunk[0] : chunk @@ -558,8 +558,8 @@ class FormattingTransform extends Transform { } } - // if it's a sequence chunk coming down, emit a FASTA directive and - // change to FASTA mode + // if it's a sequence chunk coming down, emit a FASTA directive and change + // to FASTA mode if ('sequence' in chunk && !this.fastaMode) { this.push('##FASTA\n') this.fastaMode = true @@ -603,8 +603,8 @@ export function formatStream(options: FormatOptions = {}): FormattingTransform { * Format a stream of features, directives, comments and/or sequences into a * GFF3 file and write it to the filesystem. - * Inserts synchronization (###) marks and a ##gff-version - * directive automatically (if one is not already present). + * Inserts synchronization (###) marks and a ##gff-version directive + * automatically (if one is not already present). * * @param stream - the stream to write to the file * @param filename - the file path to write to diff --git a/src/parse.ts b/src/parse.ts index abeb4db..1b0ecc7 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -62,13 +62,12 @@ export default class Parser { sequenceCallback: (sequence: GFF3.GFF3Sequence) => void bufferSize: number fastaParser: FASTAParser | undefined = undefined - // if this is true, the parser ignores the - // rest of the lines in the file. currently - // set when the file switches over to FASTA + // if this is true, the parser ignores the rest of the lines in the file. + // currently set when the file switches over to FASTA eof = false lineNumber = 0 - // features that we have to keep on hand for now because they - // might be referenced by something else + // features that we have to keep on hand for now because they might be + // referenced by something else private _underConstructionTopLevel: GFF3.GFF3Feature[] = [] // index of the above by ID private _underConstructionById: Record = @@ -77,8 +76,7 @@ export default class Parser { string, Record | undefined > = {} - // features that reference something we have not seen yet - // structured as: + // features that reference something we have not seen yet. 
structured as: // { 'some_id' : { // 'Parent' : [ orphans that have a Parent attr referencing it ], // 'Derives_from' : [ orphans that have a Derives_from attr referencing it ], @@ -211,8 +209,8 @@ export default class Parser { } /** - * return all under-construction features, called when we know - * there will be no additional data to attach to them + * return all under-construction features, called when we know there will be + * no additional data to attach to them */ private _emitAllUnderConstructionFeatures() { this._underConstructionTopLevel.forEach(this._emitItem.bind(this)) @@ -221,8 +219,8 @@ export default class Parser { this._underConstructionById = {} this._completedReferences = {} - // if we have any orphans hanging around still, this is a - // problem. die with a parse error + // if we have any orphans hanging around still, this is a problem. die with + // a parse error if (Array.from(Object.values(this._underConstructionOrphans)).length) { throw new Error( `some features reference other features that do not exist in the file (or in the same '###' scope). ${Object.keys( @@ -250,8 +248,7 @@ export default class Parser { : featureLine.attributes?.Derives_from || [] if (!ids.length && !parents.length && !derives.length) { - // if it has no IDs and does not refer to anything, we can just - // output it + // if it has no IDs and does not refer to anything, we can just output it this._emitItem([featureLine]) return } @@ -271,8 +268,8 @@ export default class Parser { existing.push(featureLine) feature = existing } else { - // haven't seen it yet, so buffer it so we can attach - // child features to it + // haven't seen it yet, so buffer it so we can attach child features to + // it feature = [featureLine] this._enforceBufferSizeLimit(1) From 59d051bd2bd3ff8ec2608caf1e8ae543286dfb9d Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 16:59:01 -0600 Subject: [PATCH 04/17] Use more thorough line ending regexp --- src/api.ts | 2 +- src/parse.ts | 2 +- src/util.ts | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api.ts b/src/api.ts index a1b571b..4ba1d66 100644 --- a/src/api.ts +++ b/src/api.ts @@ -494,7 +494,7 @@ export function parseStringSync( }, }) - str.split(/\r?\n/).forEach(parser.addLine.bind(parser)) + str.split(/\r\n|[\r\n]/).forEach(parser.addLine.bind(parser)) parser.finish() return items diff --git a/src/parse.ts b/src/parse.ts index 1b0ecc7..3ef1903 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -154,7 +154,7 @@ export default class Parser { this.fastaParser.addLine(line) } else { // it's a parse error - const errLine = line.replace(/\r?\n?$/g, '') + const errLine = line.replace(/\r\n|[\r\n]$/g, '') throw new Error(`GFF3 parse error. 
Cannot parse '${errLine}'.`) } } diff --git a/src/util.ts b/src/util.ts index 239e6f6..8e11865 100644 --- a/src/util.ts +++ b/src/util.ts @@ -52,7 +52,7 @@ export function parseAttributes(attrString: string): GFF3Attributes { const attrs: GFF3Attributes = {} attrString - .replace(/\r?\n$/, '') + .replace(/\r\n|[\r\n]$/, '') .split(';') .forEach((a) => { const nv = a.split('=', 2) @@ -121,7 +121,7 @@ export function parseDirective( const parsed: GFF3Directive = { directive: name } if (contents.length) { - contents = contents.replace(/\r?\n$/, '') + contents = contents.replace(/\r\n|[\r\n]$/, '') parsed.value = contents } From cd334534ef347007834bdd65e328bf83d68f154f Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 17:10:50 -0600 Subject: [PATCH 05/17] Better TypeScript types on formatItem --- src/api.ts | 6 ++++-- src/util.ts | 33 ++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/api.ts b/src/api.ts index 4ba1d66..b4f7b41 100644 --- a/src/api.ts +++ b/src/api.ts @@ -515,7 +515,9 @@ export function formatSync(items: GFF3Item[]): string { if ('sequence' in i) sequences.push(i) else other.push(i) }) - let str = other.map(formatItem).join('') + let str = other + .map((o) => (Array.isArray(o) ? formatItem(o).join('') : formatItem(o))) + .join('') if (sequences.length) { str += '##FASTA\n' str += sequences.map(formatSequence).join('') @@ -565,7 +567,7 @@ class FormattingTransform extends Transform { this.fastaMode = true } - if (Array.isArray(chunk)) str = chunk.map(formatItem).join('') + if (Array.isArray(chunk)) str = chunk.map((c) => formatItem(c)).join('') else str = formatItem(chunk) this.push(str) diff --git a/src/util.ts b/src/util.ts index 8e11865..c899e68 100644 --- a/src/util.ts +++ b/src/util.ts @@ -277,6 +277,16 @@ export function formatSequence(seq: GFF3Sequence): string { }\n` } +function formatSingleItem( + item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence, +) { + if ('attributes' in item) return formatFeature(item) + if ('directive' in item) return formatDirective(item) + if ('sequence' in item) return formatSequence(item) + if ('comment' in item) return formatComment(item) + return '# (invalid item found during format)\n' +} + /** * Format a directive, comment, sequence, or feature, or array of such items, * into one or more lines of GFF3. 
@@ -284,6 +294,17 @@ export function formatSequence(seq: GFF3Sequence): string { * @param itemOrItems - A comment, sequence, or feature, or array of such items * @returns A formatted string or array of strings */ +export function formatItem( + item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence, +): string +export function formatItem( + items: ( + | GFF3FeatureLineWithRefs + | GFF3Directive + | GFF3Comment + | GFF3Sequence + )[], +): string[] export function formatItem( itemOrItems: | GFF3FeatureLineWithRefs @@ -291,17 +312,7 @@ export function formatItem( | GFF3Comment | GFF3Sequence | (GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence)[], -): string | string[] { - function formatSingleItem( - item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence, - ) { - if ('attributes' in item) return formatFeature(item) - if ('directive' in item) return formatDirective(item) - if ('sequence' in item) return formatSequence(item) - if ('comment' in item) return formatComment(item) - return '# (invalid item found during format)\n' - } - +) { if (Array.isArray(itemOrItems)) { return itemOrItems.map(formatSingleItem) } From 7ac07524be003fb253d1d8d5ba10179f918a1f12 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 17:19:18 -0600 Subject: [PATCH 06/17] Prefer TS private over _ prefix for private fields --- src/api.ts | 12 +++---- src/parse.ts | 100 +++++++++++++++++++++++++-------------------------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/api.ts b/src/api.ts index b4f7b41..1e99caf 100644 --- a/src/api.ts +++ b/src/api.ts @@ -105,17 +105,17 @@ class GFFTransform extends Transform { }) } - private _addLine(data: string | undefined) { + private addLine(data: string | undefined) { if (data) { this.parser.addLine(data) } } - private _nextText(buffer: string) { + private nextText(buffer: string) { const pieces = (this.textBuffer + buffer).split(/\r?\n/) this.textBuffer = pieces.pop() || '' - pieces.forEach((piece) => this._addLine(piece)) + pieces.forEach((piece) => this.addLine(piece)) } _transform( @@ -123,13 +123,13 @@ class GFFTransform extends Transform { _encoding: BufferEncoding, callback: TransformCallback, ) { - this._nextText(this.decoder.write(chunk)) + this.nextText(this.decoder.write(chunk)) _callback(callback) } _flush(callback: TransformCallback) { - if (this.decoder.end) this._nextText(this.decoder.end()) - if (this.textBuffer != null) this._addLine(this.textBuffer) + if (this.decoder.end) this.nextText(this.decoder.end()) + if (this.textBuffer != null) this.addLine(this.textBuffer) this.parser.finish() _callback(callback) } diff --git a/src/parse.ts b/src/parse.ts index 3ef1903..59ca4f7 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -19,7 +19,7 @@ export class FASTAParser { addLine(line: string): void { const defMatch = /^>\s*(\S+)\s*(.*)/.exec(line) if (defMatch) { - this._flush() + this.flush() this.currentSequence = { id: defMatch[1], sequence: '' } if (defMatch[2]) this.currentSequence.description = defMatch[2].trim() } else if (this.currentSequence && /\S/.test(line)) { @@ -27,12 +27,12 @@ export class FASTAParser { } } - private _flush() { + private flush() { if (this.currentSequence) this.seqCallback(this.currentSequence) } finish(): void { - this._flush() + this.flush() } } @@ -68,11 +68,11 @@ export default class Parser { lineNumber = 0 // features that we have to keep on hand for now because they might be // referenced by something else - private _underConstructionTopLevel: 
GFF3.GFF3Feature[] = [] + private underConstructionTopLevel: GFF3.GFF3Feature[] = [] // index of the above by ID - private _underConstructionById: Record = + private underConstructionById: Record = {} - private _completedReferences: Record< + private completedReferences: Record< string, Record | undefined > = {} @@ -82,7 +82,7 @@ export default class Parser { // 'Derives_from' : [ orphans that have a Derives_from attr referencing it ], // } // } - private _underConstructionOrphans: Record = {} + private underConstructionOrphans: Record = {} constructor(args: ParserArgs) { // eslint-disable-next-line @typescript-eslint/no-empty-function @@ -116,7 +116,7 @@ export default class Parser { if (/^\s*[^#\s>]/.test(line)) { // feature line, most common case - this._bufferLine(line) + this.bufferLine(line) return } @@ -128,27 +128,27 @@ export default class Parser { if (hashsigns.length === 3) { // sync directive, all forward-references are resolved. - this._emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures() } else if (hashsigns.length === 2) { const directive = GFF3.parseDirective(line) if (directive) { if (directive.directive === 'FASTA') { - this._emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures() this.eof = true this.fastaParser = new FASTAParser(this.sequenceCallback) } else { - this._emitItem(directive) + this.emitItem(directive) } } } else { contents = contents.replace(/\s*/, '') - this._emitItem({ comment: contents }) + this.emitItem({ comment: contents }) } } else if (/^\s*$/.test(line)) { // blank line, do nothing } else if (/^\s*>/.test(line)) { // implicit beginning of a FASTA section - this._emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures() this.eof = true this.fastaParser = new FASTAParser(this.sequenceCallback) this.fastaParser.addLine(line) @@ -160,12 +160,12 @@ export default class Parser { } finish(): void { - this._emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures() if (this.fastaParser) this.fastaParser.finish() this.endCallback() } - private _emitItem( + private emitItem( i: GFF3.GFF3Feature | GFF3.GFF3Directive | GFF3.GFF3Comment, ) { if (Array.isArray(i)) this.featureCallback(i) @@ -173,7 +173,7 @@ export default class Parser { else if ('comment' in i) this.commentCallback(i) } - private _enforceBufferSizeLimit(additionalItemCount = 0) { + private enforceBufferSizeLimit(additionalItemCount = 0) { const _unbufferItem = (item?: GFF3.GFF3Feature) => { if ( item && @@ -184,8 +184,8 @@ export default class Parser { ) { const ids = item[0].attributes.ID ids.forEach((id) => { - delete this._underConstructionById[id] - delete this._completedReferences[id] + delete this.underConstructionById[id] + delete this.completedReferences[id] }) item.forEach((i) => { if (i.child_features) @@ -197,12 +197,12 @@ export default class Parser { } while ( - this._underConstructionTopLevel.length + additionalItemCount > + this.underConstructionTopLevel.length + additionalItemCount > this.bufferSize ) { - const item = this._underConstructionTopLevel.shift() + const item = this.underConstructionTopLevel.shift() if (item) { - this._emitItem(item) + this.emitItem(item) _unbufferItem(item) } } @@ -212,26 +212,26 @@ export default class Parser { * return all under-construction features, called when we know there will be * no additional data to attach to them */ - private _emitAllUnderConstructionFeatures() { - this._underConstructionTopLevel.forEach(this._emitItem.bind(this)) + private 
emitAllUnderConstructionFeatures() { + this.underConstructionTopLevel.forEach(this.emitItem.bind(this)) - this._underConstructionTopLevel = [] - this._underConstructionById = {} - this._completedReferences = {} + this.underConstructionTopLevel = [] + this.underConstructionById = {} + this.completedReferences = {} // if we have any orphans hanging around still, this is a problem. die with // a parse error - if (Array.from(Object.values(this._underConstructionOrphans)).length) { + if (Array.from(Object.values(this.underConstructionOrphans)).length) { throw new Error( `some features reference other features that do not exist in the file (or in the same '###' scope). ${Object.keys( - this._underConstructionOrphans, + this.underConstructionOrphans, )}`, ) } } // do the right thing with a newly-parsed feature line - private _bufferLine(line: string) { + private bufferLine(line: string) { const rawFeatureLine = GFF3.parseFeature(line) const featureLine: GFF3.GFF3FeatureLineWithRefs = { ...rawFeatureLine, @@ -249,17 +249,17 @@ export default class Parser { if (!ids.length && !parents.length && !derives.length) { // if it has no IDs and does not refer to anything, we can just output it - this._emitItem([featureLine]) + this.emitItem([featureLine]) return } let feature: GFF3.GFF3Feature | undefined = undefined ids.forEach((id) => { - const existing = this._underConstructionById[id] + const existing = this.underConstructionById[id] if (existing) { // another location of the same feature if (existing[existing.length - 1].type !== featureLine.type) { - this._parseError( + this.parseError( `multi-line feature "${id}" has inconsistent types: "${ featureLine.type }", "${existing[existing.length - 1].type}"`, @@ -272,27 +272,27 @@ export default class Parser { // it feature = [featureLine] - this._enforceBufferSizeLimit(1) + this.enforceBufferSizeLimit(1) if (!parents.length && !derives.length) { - this._underConstructionTopLevel.push(feature) + this.underConstructionTopLevel.push(feature) } - this._underConstructionById[id] = feature + this.underConstructionById[id] = feature // see if we have anything buffered that refers to it - this._resolveReferencesTo(feature, id) + this.resolveReferencesTo(feature, id) } }) // try to resolve all its references - this._resolveReferencesFrom( + this.resolveReferencesFrom( feature || [featureLine], { Parent: parents, Derives_from: derives }, ids, ) } - private _resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) { - const references = this._underConstructionOrphans[id] + private resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) { + const references = this.underConstructionOrphans[id] // references is of the form // { // 'Parent' : [ orphans that have a Parent attr referencing this feature ], @@ -305,15 +305,15 @@ export default class Parser { feature.forEach((loc) => { loc.derived_features.push(...references.Derives_from) }) - delete this._underConstructionOrphans[id] + delete this.underConstructionOrphans[id] } - private _parseError(message: string) { + private parseError(message: string) { this.eof = true this.errorCallback(`${this.lineNumber}: ${message}`) } - private _resolveReferencesFrom( + private resolveReferencesFrom( feature: GFF3.GFF3Feature, references: { Parent: string[]; Derives_from: string[] }, ids: string[], @@ -335,12 +335,12 @@ export default class Parser { } references.Parent.forEach((toId) => { - const otherFeature = this._underConstructionById[toId] + const otherFeature = this.underConstructionById[toId] if (otherFeature) { const 
pname = containerAttributes.Parent if ( !ids.filter((id) => - postSet(this._completedReferences, id, `Parent,${toId}`), + postSet(this.completedReferences, id, `Parent,${toId}`), ).length ) { otherFeature.forEach((location) => { @@ -348,25 +348,25 @@ export default class Parser { }) } } else { - let ref = this._underConstructionOrphans[toId] + let ref = this.underConstructionOrphans[toId] if (!ref) { ref = { Parent: [], Derives_from: [], } - this._underConstructionOrphans[toId] = ref + this.underConstructionOrphans[toId] = ref } ref.Parent.push(feature) } }) references.Derives_from.forEach((toId) => { - const otherFeature = this._underConstructionById[toId] + const otherFeature = this.underConstructionById[toId] if (otherFeature) { const pname = containerAttributes.Derives_from if ( !ids.filter((id) => - postSet(this._completedReferences, id, `Derives_from,${toId}`), + postSet(this.completedReferences, id, `Derives_from,${toId}`), ).length ) { otherFeature.forEach((location) => { @@ -374,13 +374,13 @@ export default class Parser { }) } } else { - let ref = this._underConstructionOrphans[toId] + let ref = this.underConstructionOrphans[toId] if (!ref) { ref = { Parent: [], Derives_from: [], } - this._underConstructionOrphans[toId] = ref + this.underConstructionOrphans[toId] = ref } ref.Derives_from.push(feature) } From f3e27f8d2431208c9595e424791de5c5dea8118a Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 28 Mar 2023 22:20:25 -0600 Subject: [PATCH 07/17] Move to web streams, limit to node >=16 --- .eslintignore | 8 + .github/workflows/pull_request.yml | 12 +- .github/workflows/push.yml | 6 +- README.md | 198 ++++++++++++++------- package.json | 15 +- src/api.ts | 270 +++++++++++++++-------------- src/gff-to-json.ts | 23 --- src/index.ts | 36 ++-- src/parse.ts | 145 ++++++++-------- test/format.test.ts | 54 +++--- test/parse.test.ts | 87 ++++------ test/util.ts | 46 +++++ tsconfig.json | 4 +- yarn.lock | 135 ++++++++++----- 14 files changed, 595 insertions(+), 444 deletions(-) create mode 100644 .eslintignore delete mode 100644 src/gff-to-json.ts create mode 100644 test/util.ts diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..5911fc2 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,8 @@ +.DS_Store +node_modules +coverage +dist +*.log +.vscode/ +esm +.eslintcache diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index ede3d8a..f94e6e8 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -4,27 +4,27 @@ on: pull_request jobs: lint: - name: Lint on node 14 and ubuntu-latest + name: Lint on node 16 and ubuntu-latest runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Use Node.js 14 + - name: Use Node.js 16 uses: actions/setup-node@v1 with: - node-version: '14' + node-version: '16' - name: Install deps and build (with cache) uses: bahmutov/npm-install@v1 - name: Lint codebase run: yarn lint test: - name: Test and lint on node 14.x and ubuntu-latest + name: Test and lint on node 16.x and ubuntu-latest runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Use Node.js 14.x + - name: Use Node.js 16.x uses: actions/setup-node@v2 with: - node-version: '14' + node-version: '16' - name: Install deps (with cache) uses: bahmutov/npm-install@v1 - name: Test codebase diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index f8b5900..6a937ef 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -4,14 +4,14 @@ on: push jobs: test: - 
name: Test and lint on node 14.x and ubuntu-latest + name: Test and lint on node 16.x and ubuntu-latest runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Use Node.js 14.x + - name: Use Node.js 16.x uses: actions/setup-node@v2 with: - node-version: '14' + node-version: '16' - name: Install deps (with cache) uses: bahmutov/npm-install@v1 - name: Test codebase diff --git a/README.md b/README.md index 9215895..df93468 100644 --- a/README.md +++ b/README.md @@ -17,54 +17,109 @@ specification](https://github.com/The-Sequence-Ontology/Specifications/blob/mast with `disableDerivesFromReferences`) - only compatible with GFF3 +## Compatability + +Works in the browser and with Node.js v16 and up. + ## Install $ npm install --save @gmod/gff ## Usage -```js -const gff = require('@gmod/gff').default -// or in ES6 (recommended) -import gff from '@gmod/gff' +### Node.js example -const fs = require('fs') +```js +import { + createReadStream, + createWriteStream, + readFileSync, + writeFileSync, +} from 'fs' +// Readable.toWeb and Writable.toWeb are only available in Node.js v18 and up +// in Node.js 16, you'll have to provide your own stream source and sink +import { Readable, Writable } from 'stream' +// TransformStream is available without importing in Node.js v18 and up +import { TransformStream } from 'stream/web' +import { + formatSync, + parseStringSync, + GFFTransformer, + GFFFormattingTransformer, +} from '@gmod/gff' // parse a file from a file name. parses only features and sequences by default, // set options to parse directives and/or comments -fs.createReadStream('path/to/my/file.gff3') - .pipe(gff.parseStream({ parseAll: true })) - .on('data', (data) => { - if (data.directive) { +;(async () => { + const readStream = createReadStream('/path/to/my/file.gff3') + const streamOfGFF3 = Readable.toWeb(readStream).pipeThrough( + new TransformStream(new GFFTransformer({ parseAll: true })), + ) + for await (const data of streamOfGFF3) { + if ('directive' in data) { + console.log('got a directive', data) + } else if ('comment' in data) { + console.log('got a comment', data) + } else if ('sequence' in data) { + console.log('got a sequence from a FASTA section') + } else { + console.log('got a feature', data) + } + } + + // parse a string of gff3 synchronously + const stringOfGFF3 = readFileSync('/path/to/my/file.gff3', 'utf8') + const arrayOfGFF3ITems = parseStringSync(stringOfGFF3) + + // format an array of items to a string + const newStringOfGFF3 = formatSync(arrayOfGFF3ITems) + writeFileSync('/path/to/new/file.gff3', newStringOfGFF3) + + // read a file, format it, and write it to a new file. inserts sync marks and + // a '##gff-version 3' header if one is not already present + await Readable.toWeb(createReadStream('/path/to/my/file.gff3')) + .pipeThrough(new TransformStream(new GFFTransformer({ parseAll: true }))) + .pipeThrough(new TransformStream(new GFFFormattingTransformer())) + .pipeTo(Writable.toWeb(createWriteStream('/path/to/my/file.gff3'))) +})() +``` + +### Browser example + +```js +import { GFFTransformer } from '@gmod/gff' + +// parse a file from a URL. 
parses only features and sequences by default, set +// options to parse directives and/or comments +;(async () => { + const response = await fetch('http://example.com/file.gff3') + if (!response.ok) { + throw new Error('Bad response') + } + if (!response.body) { + throw new Error('No response body') + } + const reader = response.body + .pipeThrough(new TransformStream(new GFFTransformer({ parseAll: true }))) + .getReader() + let result + do { + result = await reader.read() + if (result.done) { + continue + } + const data = result.value + if ('directive' in data) { console.log('got a directive', data) - } else if (data.comment) { + } else if ('comment' in data) { console.log('got a comment', data) - } else if (data.sequence) { + } else if ('sequence' in data) { console.log('got a sequence from a FASTA section') } else { console.log('got a feature', data) } - }) - -// parse a string of gff3 synchronously -const stringOfGFF3 = fs.readFileSync('my_annotations.gff3').toString() -const arrayOfThings = gff.parseStringSync(stringOfGFF3) - -// format an array of items to a string -const newStringOfGFF3 = gff.formatSync(arrayOfThings) - -// format a stream of things to a stream of text. inserts sync marks -// automatically. -myStreamOfGFF3Objects - .pipe(gff.formatStream()) - .pipe(fs.createWriteStream('my_new.gff3')) - -// format a stream of things and write it to a gff3 file. inserts sync marks and -// a '##gff-version 3' header if one is not already present -gff.formatFile( - myStreamOfGFF3Objects, - fs.createWriteStream('my_new_2.gff3', { encoding: 'utf8' }), -) + } while (!result.done) +})() ``` ## Object format @@ -202,16 +257,21 @@ ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA`)[ - [parseSequences](#parsesequences) - [parseAll](#parseall) - [bufferSize](#buffersize) -- [parseStream](#parsestream) +- [GFFTransformer](#gfftransformer) - [Parameters](#parameters) -- [parseStringSync](#parsestringsync) +- [parseStream](#parsestream) - [Parameters](#parameters-1) -- [formatSync](#formatsync) +- [parseStringSync](#parsestringsync) - [Parameters](#parameters-2) -- [formatStream](#formatstream) +- [formatSync](#formatsync) - [Parameters](#parameters-3) -- [formatFile](#formatfile) +- [FormatOptions](#formatoptions) + - [minSyncLines](#minsynclines) + - [insertVersionDirective](#insertversiondirective) +- [GFFFormattingTransformer](#gffformattingtransformer) - [Parameters](#parameters-4) +- [formatStream](#formatstream) + - [Parameters](#parameters-5) ### ParseOptions @@ -266,6 +326,15 @@ Maximum number of GFF3 lines to buffer, default 1000 Type: [number](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number) +### GFFTransformer + +Parse a stream of text data into a stream of feature, directive, comment, +an sequence objects. + +#### Parameters + +- `options` **[ParseOptions](#parseoptions)** Parser options (optional, default `{}`) + ### parseStream Parse a stream of text data into a stream of feature, directive, comment, @@ -275,7 +344,7 @@ an sequence objects. - `options` **[ParseOptions](#parseoptions)** Parsing options (optional, default `{}`) -Returns **GFFTransform** stream (in objectMode) of parsed items +Returns **[GFFTransformer](#gfftransformer)** stream (in objectMode) of parsed items ### parseStringSync @@ -300,51 +369,59 @@ GFF3. Does not insert synchronization (###) marks. 
Returns **[string](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String)** the formatted GFF3 -### formatStream +### FormatOptions -Format a stream of features, directives, comments and/or sequences into a +Formatter options + +#### minSyncLines + +The minimum number of lines to emit between sync (###) directives, default +100 + +Type: [number](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number) + +#### insertVersionDirective + +Whether to insert a version directive at the beginning of a formatted +stream if one does not exist already, default true + +Type: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean) + +### GFFFormattingTransformer + +Transform a stream of features, directives, comments and/or sequences into a stream of GFF3 text. Inserts synchronization (###) marks automatically. #### Parameters -- `options` **FormatOptions** parser options (optional, default `{}`) - -Returns **FormattingTransform** +- `options` **[FormatOptions](#formatoptions)** Formatter options (optional, default `{}`) -### formatFile +### formatStream Format a stream of features, directives, comments and/or sequences into a -GFF3 file and write it to the filesystem. +stream of GFF3 text. -Inserts synchronization (###) marks and a ##gff-version -directive automatically (if one is not already present). +Inserts synchronization (###) marks automatically. #### Parameters -- `stream` **Readable** the stream to write to the file -- `writeStream` **Writable** -- `options` **FormatOptions** parser options (optional, default `{}`) -- `filename` the file path to write to +- `options` **[FormatOptions](#formatoptions)** parser options (optional, default `{}`) -Returns **[Promise](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise)\** promise for null that resolves when the stream has been written +Returns **[GFFFormattingTransformer](#gffformattingtransformer)** ## About `util` There is also a `util` module that contains super-low-level functions for dealing with lines and parts of lines. ```js -// non-ES6 -const util = require('@gmod/gff').default.util -// or, with ES6 -import gff from '@gmod/gff' -const util = gff.util +import { util } from '@gmod/gff' const gff3Lines = util.formatItem({ seq_id: 'ctgA', ... -})) +}) ``` ## util @@ -530,9 +607,10 @@ into one or more lines of GFF3. 
#### Parameters -- `itemOrItems` **([GFF3FeatureLineWithRefs](#gff3featurelinewithrefs) | [GFF3Directive](#gff3directive) | [GFF3Comment](#gff3comment) | [GFF3Sequence](#gff3sequence) | [Array](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array)<([GFF3FeatureLineWithRefs](#gff3featurelinewithrefs) | [GFF3Directive](#gff3directive) | [GFF3Comment](#gff3comment) | [GFF3Sequence](#gff3sequence))>)** A comment, sequence, or feature, or array of such items +- `item` **([GFF3FeatureLineWithRefs](#gff3featurelinewithrefs) | [GFF3Directive](#gff3directive) | [GFF3Comment](#gff3comment) | [GFF3Sequence](#gff3sequence))** +- `itemOrItems` A comment, sequence, or feature, or array of such items -Returns **([string](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String) | [Array](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array)<[string](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String)>)** A formatted string or array of strings +Returns **[string](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String)** A formatted string or array of strings ### GFF3Attributes diff --git a/package.json b/package.json index d41b6fe..515c706 100644 --- a/package.json +++ b/package.json @@ -14,11 +14,8 @@ "email": "rbuels@gmail.com", "url": "https://github.com/rbuels" }, - "bin": { - "gff-to-json": "dist/gff-to-json.js" - }, "engines": { - "node": ">=10" + "node": ">=16" }, "files": [ "dist", @@ -35,8 +32,8 @@ "docs:util": "documentation readme src/util.ts --section=util --shallow", "docs:format": "prettier --write README.md", "prebuild": "npm-run-all clean", - "build:esm": "tsc --target es2018 --outDir esm", - "build:es5": "tsc --target es2015 --module commonjs --outDir dist", + "build:esm": "tsc --outDir esm", + "build:cjs": "tsc --module commonjs --outDir dist", "build": "npm-run-all --parallel build:*", "prepublishOnly": "npm-run-all test build", "postversion": "git push --follow-tags" @@ -48,12 +45,9 @@ "gff", "genomics" ], - "dependencies": { - "stream-browserify": "^3.0.0" - }, "devDependencies": { "@types/jest": "^29.2.4", - "@types/node": "^10.17.60", + "@types/node": "^16.11.7", "@typescript-eslint/eslint-plugin": "^5.46.1", "@typescript-eslint/parser": "^5.46.1", "documentation": "^14.0.1", @@ -69,6 +63,7 @@ "rimraf": "^3.0.2", "tmp-promise": "^3.0.2", "ts-jest": "^29.0.3", + "ts-node": "^10.9.1", "tslib": "^2.4.1", "typescript": "^4.9.4" } diff --git a/src/api.ts b/src/api.ts index 1e99caf..c3f607b 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,29 +1,14 @@ -import { Transform, TransformCallback, Readable, Writable } from 'stream' -import { StringDecoder as Decoder } from 'string_decoder' - -import Parser from './parse' +import Parser, { ParseCallbacks } from './parse' import { formatItem, formatSequence, GFF3Comment, GFF3Directive, GFF3Feature, - GFF3FeatureLine, - GFF3FeatureLineWithRefs, GFF3Sequence, GFF3Item, } from './util' -export type { - GFF3Comment, - GFF3Directive, - GFF3Feature, - GFF3FeatureLine, - GFF3FeatureLineWithRefs, - GFF3Sequence, - GFF3Item, -} - /** Parser options */ export interface ParseOptions { /** Whether to resolve references to derives from features */ @@ -49,13 +34,6 @@ export interface ParseOptions { type ParseOptionsProcessed = Required> -// call a callback on the next process tick if running in -// an environment that supports it -function _callback(callback: TransformCallback) { - if (process && process.nextTick) 
process.nextTick(callback) - else callback() -} - // shared arg processing for the parse routines function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { const out = { @@ -79,59 +57,94 @@ function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { return out } -class GFFTransform extends Transform { - encoding: BufferEncoding - decoder: Decoder - textBuffer = '' - parser: Parser - - constructor(inputOptions: ParseOptions = {}) { - super({ objectMode: true }) - const options = _processParseOptions(inputOptions) - - this.encoding = inputOptions.encoding || 'utf8' - - this.decoder = new Decoder() +/** + * Parse a stream of text data into a stream of feature, directive, comment, + * an sequence objects. + */ +export class GFFTransformer implements Transformer { + private decoder: TextDecoder + private parser: Parser + private lastString = '' + private parseFeatures: boolean + private parseDirectives: boolean + private parseComments: boolean + private parseSequences: boolean - const push = this.push.bind(this) - this.parser = new Parser({ - featureCallback: options.parseFeatures ? push : undefined, - directiveCallback: options.parseDirectives ? push : undefined, - commentCallback: options.parseComments ? push : undefined, - sequenceCallback: options.parseSequences ? push : undefined, - errorCallback: (err) => this.emit('error', err), - bufferSize: options.bufferSize, - disableDerivesFromReferences: options.disableDerivesFromReferences, - }) + /** + * Options for how the text stream is parsed + * @param options - Parser options + */ + constructor(options: ParseOptions = {}) { + const processedOptions = _processParseOptions(options) + this.decoder = new TextDecoder(processedOptions.encoding) + const { bufferSize, disableDerivesFromReferences } = processedOptions + this.parser = new Parser({ bufferSize, disableDerivesFromReferences }) + this.parseFeatures = processedOptions.parseFeatures + this.parseDirectives = processedOptions.parseDirectives + this.parseComments = processedOptions.parseComments + this.parseSequences = processedOptions.parseSequences } - private addLine(data: string | undefined) { - if (data) { - this.parser.addLine(data) + private makeCallbacks( + controller: TransformStreamDefaultController, + ) { + const callbacks: ParseCallbacks = { + errorCallback: this.emitErrorMessage.bind(this, controller), + } + if (this.parseFeatures) { + callbacks.featureCallback = this.enqueueItem.bind(this, controller) + } + if (this.parseDirectives) { + callbacks.directiveCallback = this.enqueueItem.bind(this, controller) + } + if (this.parseComments) { + callbacks.commentCallback = this.enqueueItem.bind(this, controller) + } + if (this.parseSequences) { + callbacks.sequenceCallback = this.enqueueItem.bind(this, controller) } + return callbacks } - private nextText(buffer: string) { - const pieces = (this.textBuffer + buffer).split(/\r?\n/) - this.textBuffer = pieces.pop() || '' - - pieces.forEach((piece) => this.addLine(piece)) + private emitErrorMessage( + controller: TransformStreamDefaultController, + errorMessage: string, + ) { + controller.error(errorMessage) + } + private enqueueItem( + controller: TransformStreamDefaultController, + item: GFF3Item, + ) { + controller.enqueue(item) } - _transform( - chunk: Buffer, - _encoding: BufferEncoding, - callback: TransformCallback, + transform( + chunk: Uint8Array, + controller: TransformStreamDefaultController, ) { - this.nextText(this.decoder.write(chunk)) - _callback(callback) + // Decode the current chunk 
to string and prepend the last string + const string = `${this.lastString}${this.decoder.decode(chunk, { + stream: true, + })}` + // Extract lines from chunk + const lines = string.split(/\r\n|[\r\n]/g) + // Save last line, as it might be incomplete + this.lastString = lines.pop() || '' + // Enqueue each line in the next chunk + for (const line of lines) { + this.parser.addLine(line, this.makeCallbacks(controller)) + } } - _flush(callback: TransformCallback) { - if (this.decoder.end) this.nextText(this.decoder.end()) - if (this.textBuffer != null) this.addLine(this.textBuffer) - this.parser.finish() - _callback(callback) + flush(controller: TransformStreamDefaultController) { + const callbacks = this.makeCallbacks(controller) + this.lastString = `${this.lastString}${this.decoder.decode()}` + if (this.lastString) { + this.parser.addLine(this.lastString, callbacks) + this.lastString = '' + } + this.parser.finish(callbacks) } } @@ -142,8 +155,8 @@ class GFFTransform extends Transform { * @param options - Parsing options * @returns stream (in objectMode) of parsed items */ -export function parseStream(options: ParseOptions = {}): GFFTransform { - return new GFFTransform(options) +export function parseStream(options: ParseOptions = {}): GFFTransformer { + return new GFFTransformer(options) } /** @@ -482,20 +495,33 @@ export function parseStringSync( const items: GFF3Item[] = [] const push = items.push.bind(items) + const callbacks: ParseCallbacks = { + errorCallback: (err: string) => { + throw new Error(err) + }, + } + if (options.parseFeatures) { + callbacks.featureCallback = push + } + if (options.parseDirectives) { + callbacks.directiveCallback = push + } + if (options.parseComments) { + callbacks.commentCallback = push + } + if (options.parseSequences) { + callbacks.sequenceCallback = push + } + const parser = new Parser({ - featureCallback: options.parseFeatures ? push : undefined, - directiveCallback: options.parseDirectives ? push : undefined, - commentCallback: options.parseComments ? push : undefined, - sequenceCallback: options.parseSequences ? push : undefined, disableDerivesFromReferences: options.disableDerivesFromReferences || false, bufferSize: Infinity, - errorCallback: (err) => { - throw err - }, }) - str.split(/\r\n|[\r\n]/).forEach(parser.addLine.bind(parser)) - parser.finish() + str + .split(/\r\n|[\r\n]/) + .forEach((line) => parser.addLine.bind(parser)(line, callbacks)) + parser.finish(callbacks) return items } @@ -525,55 +551,71 @@ export function formatSync(items: GFF3Item[]): string { return str } -interface FormatOptions { +/** Formatter options */ +export interface FormatOptions { + /** + * The minimum number of lines to emit between sync (###) directives, default + * 100 + */ minSyncLines?: number + /** + * Whether to insert a version directive at the beginning of a formatted + * stream if one does not exist already, default true + */ insertVersionDirective?: boolean - encoding?: BufferEncoding } -class FormattingTransform extends Transform { +/** + * Transform a stream of features, directives, comments and/or sequences into a + * stream of GFF3 text. + * + * Inserts synchronization (###) marks automatically. 
+ */ +export class GFFFormattingTransformer implements Transformer { linesSinceLastSyncMark = 0 haveWeEmittedData = false fastaMode = false minLinesBetweenSyncMarks: number insertVersionDirective: boolean + /** + * Options for how the output text stream is formatted + * @param options - Formatter options + */ constructor(options: FormatOptions = {}) { - super(Object.assign(options, { objectMode: true })) this.minLinesBetweenSyncMarks = options.minSyncLines || 100 - this.insertVersionDirective = options.insertVersionDirective || false + this.insertVersionDirective = options.insertVersionDirective || true } - _transform( - chunk: GFF3Item[], - _encoding: BufferEncoding, - callback: TransformCallback, + transform( + chunk: GFF3Item, + controller: TransformStreamDefaultController, ) { // if we have not emitted anything yet, and this first chunk is not a // gff-version directive, emit one - let str - if (!this.haveWeEmittedData && this.insertVersionDirective) { - const thisChunk = Array.isArray(chunk) ? chunk[0] : chunk - if ('directive' in thisChunk) { - if (thisChunk.directive !== 'gff-version') { - this.push('##gff-version 3\n') - } - } + let str: string + if ( + !this.haveWeEmittedData && + this.insertVersionDirective && + (!('directive' in chunk) || + ('directive' in chunk && chunk.directive !== 'gff-version')) + ) { + controller.enqueue('##gff-version 3\n') } // if it's a sequence chunk coming down, emit a FASTA directive and change // to FASTA mode if ('sequence' in chunk && !this.fastaMode) { - this.push('##FASTA\n') + controller.enqueue('##FASTA\n') this.fastaMode = true } if (Array.isArray(chunk)) str = chunk.map((c) => formatItem(c)).join('') else str = formatItem(chunk) - this.push(str) + controller.enqueue(str) if (this.linesSinceLastSyncMark >= this.minLinesBetweenSyncMarks) { - this.push('###\n') + controller.enqueue('###\n') this.linesSinceLastSyncMark = 0 } else { // count the number of newlines in this chunk @@ -585,7 +627,6 @@ class FormattingTransform extends Transform { } this.haveWeEmittedData = true - _callback(callback) } } @@ -597,37 +638,8 @@ class FormattingTransform extends Transform { * * @param options - parser options */ -export function formatStream(options: FormatOptions = {}): FormattingTransform { - return new FormattingTransform(options) -} - -/** - * Format a stream of features, directives, comments and/or sequences into a - * GFF3 file and write it to the filesystem. - - * Inserts synchronization (###) marks and a ##gff-version directive - * automatically (if one is not already present). 
- * - * @param stream - the stream to write to the file - * @param filename - the file path to write to - * @param options - parser options - * @returns promise for null that resolves when the stream has been written - */ -export function formatFile( - stream: Readable, - writeStream: Writable, +export function formatStream( options: FormatOptions = {}, -): Promise { - const newOptions = { - insertVersionDirective: true, - ...options, - } - - return new Promise((resolve, reject) => { - stream - .pipe(new FormattingTransform(newOptions)) - .on('end', () => resolve(null)) - .on('error', reject) - .pipe(writeStream) - }) +): GFFFormattingTransformer { + return new GFFFormattingTransformer(options) } diff --git a/src/gff-to-json.ts b/src/gff-to-json.ts deleted file mode 100644 index 51e63bb..0000000 --- a/src/gff-to-json.ts +++ /dev/null @@ -1,23 +0,0 @@ -import gff from './index' -let itemBuffer: any - -process.stdout.write('[\n') -process.stdin - .pipe(gff.parseStream({ parseAll: true })) - .on('data', (item) => { - itemBuffer = JSON.stringify(item) - if (itemBuffer) { - process.stdout.write(itemBuffer) - process.stdout.write(',\n') - } - }) - .on('error', (err) => { - console.error(err) - process.exit(1) - }) - .on('end', () => { - if (itemBuffer) { - process.stdout.write(itemBuffer) - } - process.stdout.write('\n]\n') - }) diff --git a/src/index.ts b/src/index.ts index 4a2ee37..ea069a2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,35 +1,25 @@ -import { - parseStream, - parseStringSync, - formatSync, - formatStream, - formatFile, +import * as api from './api' + +export * from './api' + +export type { + GFF3Attributes, GFF3Comment, GFF3Directive, GFF3Feature, GFF3FeatureLine, GFF3FeatureLineWithRefs, - GFF3Sequence, + GFF3GenomeBuildDirective, GFF3Item, -} from './api' + GFF3Sequence, + GFF3SequenceRegionDirective, +} from './util' import * as util from './util' -export default { - parseStream, - parseStringSync, - formatSync, - formatStream, - formatFile, +export const defaultExport = { + ...api, util, } -export type { - GFF3Comment, - GFF3Directive, - GFF3Feature, - GFF3FeatureLine, - GFF3FeatureLineWithRefs, - GFF3Sequence, - GFF3Item, -} +export default defaultExport diff --git a/src/parse.ts b/src/parse.ts index 59ca4f7..b03be50 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -5,14 +5,23 @@ const containerAttributes = { Derives_from: 'derived_features' as const, } +export interface ParseCallbacks { + featureCallback?(feature: GFF3.GFF3Feature): void + commentCallback?(comment: GFF3.GFF3Comment): void + errorCallback?(error: string): void + directiveCallback?(directive: GFF3.GFF3Directive): void + sequenceCallback?(sequence: GFF3.GFF3Sequence): void +} + export class FASTAParser { - seqCallback: (sequence: GFF3.GFF3Sequence) => void currentSequence: | { id: string; sequence: string; description?: string } | undefined - constructor(seqCallback: (sequence: GFF3.GFF3Sequence) => void) { - this.seqCallback = seqCallback + constructor( + // eslint-disable-next-line @typescript-eslint/no-empty-function + private seqCallback: (sequence: GFF3.GFF3Sequence) => void = () => {}, + ) { this.currentSequence = undefined } @@ -37,12 +46,7 @@ export class FASTAParser { } interface ParserArgs { - featureCallback?(feature: GFF3.GFF3Feature): void endCallback?(): void - commentCallback?(comment: GFF3.GFF3Comment): void - errorCallback?(error: string): void - directiveCallback?(directive: GFF3.GFF3Directive): void - sequenceCallback?(sequence: GFF3.GFF3Sequence): void bufferSize?: number 
disableDerivesFromReferences?: boolean } @@ -53,13 +57,8 @@ interface References { } export default class Parser { - featureCallback: (feature: GFF3.GFF3Feature) => void endCallback: () => void - commentCallback: (comment: GFF3.GFF3Comment) => void - errorCallback: (error: string) => void disableDerivesFromReferences: boolean - directiveCallback: (directive: GFF3.GFF3Directive) => void - sequenceCallback: (sequence: GFF3.GFF3Sequence) => void bufferSize: number fastaParser: FASTAParser | undefined = undefined // if this is true, the parser ignores the rest of the lines in the file. @@ -82,18 +81,14 @@ export default class Parser { // 'Derives_from' : [ orphans that have a Derives_from attr referencing it ], // } // } - private underConstructionOrphans: Record = {} + private underConstructionOrphans: Map = + new Map() constructor(args: ParserArgs) { // eslint-disable-next-line @typescript-eslint/no-empty-function const nullFunc = () => {} - this.featureCallback = args.featureCallback || nullFunc this.endCallback = args.endCallback || nullFunc - this.commentCallback = args.commentCallback || nullFunc - this.errorCallback = args.errorCallback || nullFunc - this.directiveCallback = args.directiveCallback || nullFunc - this.sequenceCallback = args.sequenceCallback || nullFunc this.disableDerivesFromReferences = args.disableDerivesFromReferences || false @@ -101,7 +96,7 @@ export default class Parser { this.bufferSize = args.bufferSize === undefined ? 1000 : args.bufferSize } - addLine(line: string): void { + addLine(line: string, callbacks: ParseCallbacks): void { // if we have transitioned to a fasta section, just delegate to that parser if (this.fastaParser) { this.fastaParser.addLine(line) @@ -116,7 +111,7 @@ export default class Parser { if (/^\s*[^#\s>]/.test(line)) { // feature line, most common case - this.bufferLine(line) + this.bufferLine(line, callbacks) return } @@ -128,52 +123,59 @@ export default class Parser { if (hashsigns.length === 3) { // sync directive, all forward-references are resolved. - this.emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures(callbacks) } else if (hashsigns.length === 2) { const directive = GFF3.parseDirective(line) if (directive) { if (directive.directive === 'FASTA') { - this.emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures(callbacks) this.eof = true - this.fastaParser = new FASTAParser(this.sequenceCallback) + this.fastaParser = new FASTAParser(callbacks.sequenceCallback) } else { - this.emitItem(directive) + this.emitItem(directive, callbacks) } } } else { contents = contents.replace(/\s*/, '') - this.emitItem({ comment: contents }) + this.emitItem({ comment: contents }, callbacks) } } else if (/^\s*$/.test(line)) { // blank line, do nothing } else if (/^\s*>/.test(line)) { // implicit beginning of a FASTA section - this.emitAllUnderConstructionFeatures() + this.emitAllUnderConstructionFeatures(callbacks) this.eof = true - this.fastaParser = new FASTAParser(this.sequenceCallback) + this.fastaParser = new FASTAParser(callbacks.sequenceCallback) this.fastaParser.addLine(line) } else { // it's a parse error const errLine = line.replace(/\r\n|[\r\n]$/g, '') - throw new Error(`GFF3 parse error. Cannot parse '${errLine}'.`) + throw new Error(`GFF3 parse error. 
Cannot parse '${errLine}'.`) } } - finish(): void { - this.emitAllUnderConstructionFeatures() + finish(callbacks: ParseCallbacks): void { + this.emitAllUnderConstructionFeatures(callbacks) if (this.fastaParser) this.fastaParser.finish() this.endCallback() } private emitItem( i: GFF3.GFF3Feature | GFF3.GFF3Directive | GFF3.GFF3Comment, + callbacks: ParseCallbacks, ) { - if (Array.isArray(i)) this.featureCallback(i) - else if ('directive' in i) this.directiveCallback(i) - else if ('comment' in i) this.commentCallback(i) + if (Array.isArray(i) && callbacks.featureCallback) + callbacks.featureCallback(i) + else if ('directive' in i && callbacks.directiveCallback) + callbacks.directiveCallback(i) + else if ('comment' in i && callbacks.commentCallback) + callbacks.commentCallback(i) } - private enforceBufferSizeLimit(additionalItemCount = 0) { + private enforceBufferSizeLimit( + additionalItemCount = 0, + callbacks: ParseCallbacks, + ) { const _unbufferItem = (item?: GFF3.GFF3Feature) => { if ( item && @@ -202,7 +204,7 @@ export default class Parser { ) { const item = this.underConstructionTopLevel.shift() if (item) { - this.emitItem(item) + this.emitItem(item, callbacks) _unbufferItem(item) } } @@ -212,8 +214,10 @@ export default class Parser { * return all under-construction features, called when we know there will be * no additional data to attach to them */ - private emitAllUnderConstructionFeatures() { - this.underConstructionTopLevel.forEach(this.emitItem.bind(this)) + private emitAllUnderConstructionFeatures(callbacks: ParseCallbacks) { + this.underConstructionTopLevel.forEach((f) => + this.emitItem.bind(this)(f, callbacks), + ) this.underConstructionTopLevel = [] this.underConstructionById = {} @@ -221,17 +225,17 @@ export default class Parser { // if we have any orphans hanging around still, this is a problem. die with // a parse error - if (Array.from(Object.values(this.underConstructionOrphans)).length) { + if (this.underConstructionOrphans.size) { throw new Error( - `some features reference other features that do not exist in the file (or in the same '###' scope). ${Object.keys( - this.underConstructionOrphans, + `some features reference other features that do not exist in the file (or in the same '###' scope). 
${Array.from( + this.underConstructionOrphans.keys(), )}`, ) } } // do the right thing with a newly-parsed feature line - private bufferLine(line: string) { + private bufferLine(line: string, callbacks: ParseCallbacks) { const rawFeatureLine = GFF3.parseFeature(line) const featureLine: GFF3.GFF3FeatureLineWithRefs = { ...rawFeatureLine, @@ -249,7 +253,7 @@ export default class Parser { if (!ids.length && !parents.length && !derives.length) { // if it has no IDs and does not refer to anything, we can just output it - this.emitItem([featureLine]) + this.emitItem([featureLine], callbacks) return } @@ -263,6 +267,7 @@ export default class Parser { `multi-line feature "${id}" has inconsistent types: "${ featureLine.type }", "${existing[existing.length - 1].type}"`, + callbacks, ) } existing.push(featureLine) @@ -272,7 +277,7 @@ export default class Parser { // it feature = [featureLine] - this.enforceBufferSizeLimit(1) + this.enforceBufferSizeLimit(1, callbacks) if (!parents.length && !derives.length) { this.underConstructionTopLevel.push(feature) } @@ -292,7 +297,7 @@ export default class Parser { } private resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) { - const references = this.underConstructionOrphans[id] + const references = this.underConstructionOrphans.get(id) // references is of the form // { // 'Parent' : [ orphans that have a Parent attr referencing this feature ], @@ -305,12 +310,28 @@ export default class Parser { feature.forEach((loc) => { loc.derived_features.push(...references.Derives_from) }) - delete this.underConstructionOrphans[id] + this.underConstructionOrphans.delete(id) } - private parseError(message: string) { + private parseError(message: string, callbacks: ParseCallbacks) { this.eof = true - this.errorCallback(`${this.lineNumber}: ${message}`) + callbacks.errorCallback?.(`${this.lineNumber}: ${message}`) + } + + // this is all a bit more awkward in javascript than it was in perl + private postSet( + obj: Record | undefined>, + slot1: string, + slot2: string, + ) { + let subObj = obj[slot1] + if (!subObj) { + subObj = {} + obj[slot1] = subObj + } + const returnVal = subObj[slot2] || false + subObj[slot2] = true + return returnVal } private resolveReferencesFrom( @@ -318,29 +339,13 @@ export default class Parser { references: { Parent: string[]; Derives_from: string[] }, ids: string[], ) { - // this is all a bit more awkward in javascript than it was in perl - function postSet( - obj: Record | undefined>, - slot1: string, - slot2: string, - ) { - let subObj = obj[slot1] - if (!subObj) { - subObj = {} - obj[slot1] = subObj - } - const returnVal = subObj[slot2] || false - subObj[slot2] = true - return returnVal - } - references.Parent.forEach((toId) => { const otherFeature = this.underConstructionById[toId] if (otherFeature) { const pname = containerAttributes.Parent if ( !ids.filter((id) => - postSet(this.completedReferences, id, `Parent,${toId}`), + this.postSet(this.completedReferences, id, `Parent,${toId}`), ).length ) { otherFeature.forEach((location) => { @@ -348,13 +353,13 @@ export default class Parser { }) } } else { - let ref = this.underConstructionOrphans[toId] + let ref = this.underConstructionOrphans.get(toId) if (!ref) { ref = { Parent: [], Derives_from: [], } - this.underConstructionOrphans[toId] = ref + this.underConstructionOrphans.set(toId, ref) } ref.Parent.push(feature) } @@ -366,7 +371,7 @@ export default class Parser { const pname = containerAttributes.Derives_from if ( !ids.filter((id) => - postSet(this.completedReferences, id, 
`Derives_from,${toId}`), + this.postSet(this.completedReferences, id, `Derives_from,${toId}`), ).length ) { otherFeature.forEach((location) => { @@ -374,13 +379,13 @@ export default class Parser { }) } } else { - let ref = this.underConstructionOrphans[toId] + let ref = this.underConstructionOrphans.get(toId) if (!ref) { ref = { Parent: [], Derives_from: [], } - this.underConstructionOrphans[toId] = ref + this.underConstructionOrphans.set(toId, ref) } ref.Derives_from.push(feature) } diff --git a/test/format.test.ts b/test/format.test.ts index 07fb94b..4fbf229 100644 --- a/test/format.test.ts +++ b/test/format.test.ts @@ -1,17 +1,9 @@ -import { - createReadStream, - createWriteStream, - fdatasync as fdatasyncC, - promises as fsPromises, -} from 'fs' +import fsPromises from 'fs/promises' +import { ReadableStream, WritableStream, TransformStream } from 'stream/web' import tmp from 'tmp-promise' -import { promisify } from 'util' -import getStream from 'get-stream' - import gff from '../src' - -const fdatasync = promisify(fdatasyncC) +import { FileSource, SyncFileSink } from './util' describe('GFF3 formatting', () => { ;['spec_eden', 'au9_scaffold_subset', 'hybrid1', 'hybrid2'].forEach( @@ -40,17 +32,27 @@ describe('GFF3 formatting', () => { 'utf8', ) - const resultGFF3 = await getStream( - createReadStream(require.resolve(`./data/${file}.gff3`)) - .pipe( + const stream = new ReadableStream( + new FileSource(require.resolve(`./data/${file}.gff3`)), + ) + .pipeThrough( + new TransformStream( gff.parseStream({ parseFeatures: true, parseComments: true, parseDirectives: true, }), - ) - .pipe(gff.formatStream()), - ) + ), + ) + .pipeThrough(new TransformStream(gff.formatStream())) + const chunks: string[] = [] + const reader = stream.getReader() + let result: ReadableStreamReadResult + do { + result = await reader.read() + if (result.value !== undefined) chunks.push(result.value) + } while (!result.done) + const resultGFF3 = chunks.join('') expect(resultGFF3).toEqual(expectedGFF3) }) }, @@ -60,15 +62,19 @@ describe('GFF3 formatting', () => { 'spec_eden.no_version_directive', 'au9_scaffold_subset', ].forEach((file) => { - it(`can roundtrip ${file}.gff3 with formatFile`, async () => { + it(`can roundtrip ${file}.gff3 with formatStream and insertVersionDirective`, async () => { jest.setTimeout(1000) await tmp.withFile(async (tmpFile) => { - const gff3In = createReadStream( - require.resolve(`./data/${file}.gff3`), - ).pipe(gff.parseStream({ parseAll: true })) - - await gff.formatFile(gff3In, createWriteStream(tmpFile.path)) - await fdatasync(tmpFile.fd) + await new ReadableStream( + new FileSource(require.resolve(`./data/${file}.gff3`)), + ) + .pipeThrough(new TransformStream(gff.parseStream({ parseAll: true }))) + .pipeThrough( + new TransformStream( + gff.formatStream({ insertVersionDirective: true }), + ), + ) + .pipeTo(new WritableStream(new SyncFileSink(tmpFile.fd))) const resultGFF3 = await fsPromises.readFile(tmpFile.path, 'utf8') diff --git a/test/parse.test.ts b/test/parse.test.ts index da89284..7bd6d58 100644 --- a/test/parse.test.ts +++ b/test/parse.test.ts @@ -1,4 +1,5 @@ -import { createReadStream, promises as fsPromises } from 'fs' +import fsPromises from 'fs/promises' +import { ReadableStream, TransformStream } from 'stream/web' import gff from '../src' import { formatFeature, @@ -7,6 +8,7 @@ import { GFF3Comment, GFF3Sequence, } from '../src/util' +import { FileSource } from './util' interface ReadAllResults { features: GFF3Feature[] @@ -16,43 +18,38 @@ interface ReadAllResults { all: 
(GFF3Feature | GFF3Comment | GFF3Directive | GFF3Sequence)[] } -function readAll( +async function readAll( filename: string, args: Record = {}, ): Promise { - return new Promise((resolve, reject) => { - const stuff: ReadAllResults = { - features: [], - comments: [], - directives: [], - sequences: [], - all: [], - } + const stuff: ReadAllResults = { + features: [], + comments: [], + directives: [], + sequences: [], + all: [], + } - // $p->max_lookback(1) - createReadStream(require.resolve(filename)) - .pipe( - gff.parseStream({ - parseFeatures: true, - parseDirectives: true, - parseComments: true, - parseSequences: true, - bufferSize: 10, - ...args, - }), - ) - .on('data', (d) => { - stuff.all.push(d) - if (d.directive) stuff.directives.push(d) - else if (d.comment) stuff.comments.push(d) - else if (d.sequence) stuff.sequences.push(d) - else stuff.features.push(d) - }) - .on('end', () => { - resolve(stuff) - }) - .on('error', reject) - }) + const stream = new ReadableStream(new FileSource(require.resolve(filename))) + const transformStream = new TransformStream( + gff.parseStream({ + parseFeatures: true, + parseDirectives: true, + parseComments: true, + parseSequences: true, + bufferSize: 10, + ...args, + }), + ) + const gffStream = stream.pipeThrough(transformStream) + for await (const value of gffStream) { + stuff.all.push(value) + if ('directive' in value) stuff.directives.push(value) + else if ('comment' in value) stuff.comments.push(value) + else if ('sequence' in value) stuff.sequences.push(value) + else stuff.features.push(value) + } + return stuff } describe('GFF3 parser', () => { @@ -83,7 +80,6 @@ describe('GFF3 parser', () => { ].forEach(([count, filename]) => { it(`can cursorily parse ${filename}`, async () => { const stuff = await readAll(`./data/${filename}`) - // $p->max_lookback(10); expect(stuff.all.length).toEqual(count) }) }) @@ -157,7 +153,7 @@ describe('GFF3 parser', () => { disableDerivesFromReferences: true, }) expect(stuff.all).toHaveLength(17697) - }) + }, 10000) // check that some files throw a parse error ;['mm9_sample_ensembl.gff3', 'Saccharomyces_cerevisiae_EF3_e64.gff3'].forEach( @@ -298,23 +294,4 @@ SL2.40%25ch01 IT%25AG eugene g%25e;ne 80999140 81004317 . + . Alias=Solyc01g0988 expect(stuff.sequences).toEqual(expectedOutput) }) }) - - it('can be written to directly', async () => { - const items: GFF3Feature[] = await new Promise((resolve, reject) => { - const i: GFF3Feature[] = [] - const stream = gff - .parseStream() - .on('data', (d) => i.push(d)) - .on('end', () => resolve(i)) - .on('error', reject) - - stream.write( - `SL2.40ch00 ITAG_eugene gene 16437 18189 . + . 
Alias=Solyc00g005000;ID=gene:Solyc00g005000.2;Name=Solyc00g005000.2;from_BOGAS=1;length=1753\n`, - ) - stream.end() - }) - - expect(items).toHaveLength(1) - expect(items[0][0].seq_id).toEqual('SL2.40ch00') - }) }) diff --git a/test/util.ts b/test/util.ts new file mode 100644 index 0000000..5a5d0e9 --- /dev/null +++ b/test/util.ts @@ -0,0 +1,46 @@ +import fs from 'fs' +import type { + UnderlyingByteSource, + UnderlyingSink, + ReadableByteStreamController, +} from 'stream/web' + +// In Node 18 we can use `stream.Readable.toWeb()` instead of having to make our +// own file source +export class FileSource implements UnderlyingByteSource { + type = 'bytes' as const + autoAllocateChunkSize = 1024 + fileHandeP: Promise + + constructor(filePath: string) { + this.fileHandeP = fs.promises.open(filePath) + } + + async pull(controller: ReadableByteStreamController) { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + const view = controller.byobRequest.view as ArrayBufferView + const fileHandle = await this.fileHandeP + const { bytesRead } = await fileHandle.read({ + buffer: view, + offset: view.byteOffset, + length: view.byteLength, + }) + + if (bytesRead === 0) { + await fileHandle.close() + controller.close() + } + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + controller.byobRequest.respond(bytesRead) + } +} + +export class SyncFileSink implements UnderlyingSink { + constructor(private fileDescriptor: number) {} + + write(chunk: string) { + fs.appendFileSync(this.fileDescriptor, chunk) + } +} diff --git a/tsconfig.json b/tsconfig.json index 649e0c4..bd30a82 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,8 +1,10 @@ { "include": ["src"], "compilerOptions": { + // https://github.com/microsoft/TypeScript/wiki/Node-Target-Mapping#node-16 "types": ["node", "jest"], - "lib": ["dom", "esnext"], + "lib": ["dom", "es2021"], + "target": "es2021", "declaration": true, "moduleResolution": "node", "sourceMap": true, diff --git a/yarn.lock b/yarn.lock index 382b453..ab35213 100644 --- a/yarn.lock +++ b/yarn.lock @@ -484,6 +484,13 @@ resolved "https://registry.yarnpkg.com/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz#75a2e8b51cb758a7553d6804a5932d7aace75c39" integrity sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw== +"@cspotcode/source-map-support@^0.8.0": + version "0.8.1" + resolved "https://registry.yarnpkg.com/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz#00629c35a688e05a88b1cda684fb9d5e73f000a1" + integrity sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw== + dependencies: + "@jridgewell/trace-mapping" "0.3.9" + "@eslint/eslintrc@^1.4.0": version "1.4.0" resolved "https://registry.yarnpkg.com/@eslint/eslintrc/-/eslintrc-1.4.0.tgz#8ec64e0df3e7a1971ee1ff5158da87389f167a63" @@ -760,6 +767,14 @@ resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.11.tgz#771a1d8d744eeb71b6adb35808e1a6c7b9b8c8ec" integrity sha512-Fg32GrJo61m+VqYSdRSjRXMjQ06j8YIYfcTqndLYVAaHmroZHLJZCydsWBOTDqXS2v+mjxohBWEMfg97GXmYQg== +"@jridgewell/trace-mapping@0.3.9": + version "0.3.9" + resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz#6534fd5933a53ba7cbf3a17615e273a0d1273ff9" + integrity sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ== + dependencies: + "@jridgewell/resolve-uri" "^3.0.3" + "@jridgewell/sourcemap-codec" "^1.4.10" + 
"@jridgewell/trace-mapping@^0.3.0": version "0.3.4" resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.4.tgz#f6a0832dffd5b8a6aaa633b7d9f8e8e94c83a0c3" @@ -831,6 +846,26 @@ dependencies: "@sinonjs/commons" "^1.7.0" +"@tsconfig/node10@^1.0.7": + version "1.0.9" + resolved "https://registry.yarnpkg.com/@tsconfig/node10/-/node10-1.0.9.tgz#df4907fc07a886922637b15e02d4cebc4c0021b2" + integrity sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA== + +"@tsconfig/node12@^1.0.7": + version "1.0.11" + resolved "https://registry.yarnpkg.com/@tsconfig/node12/-/node12-1.0.11.tgz#ee3def1f27d9ed66dac6e46a295cffb0152e058d" + integrity sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag== + +"@tsconfig/node14@^1.0.0": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@tsconfig/node14/-/node14-1.0.3.tgz#e4386316284f00b98435bf40f72f75a09dabf6c1" + integrity sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow== + +"@tsconfig/node16@^1.0.2": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@tsconfig/node16/-/node16-1.0.3.tgz#472eaab5f15c1ffdd7f8628bd4c4f753995ec79e" + integrity sha512-yOlFc+7UtL/89t2ZhjPvvB/DeAr3r+Dq58IgzsFkOAvVC6NMJXmCGjbptdXdR9qsX7pKcTL+s87FtYREi2dEEQ== + "@types/babel__core@^7.1.14": version "7.1.19" resolved "https://registry.yarnpkg.com/@types/babel__core/-/babel__core-7.1.19.tgz#7b497495b7d1b4812bdb9d02804d0576f43ee460" @@ -949,10 +984,10 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-17.0.23.tgz#3b41a6e643589ac6442bdbd7a4a3ded62f33f7da" integrity sha512-UxDxWn7dl97rKVeVS61vErvw086aCYhDLyvRQZ5Rk65rZKepaFdm53GeqXaKBuOhED4e9uWq34IC3TdSdJJ2Gw== -"@types/node@^10.17.60": - version "10.17.60" - resolved "https://registry.yarnpkg.com/@types/node/-/node-10.17.60.tgz#35f3d6213daed95da7f0f73e75bcc6980e90597b" - integrity sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw== +"@types/node@^16.11.7": + version "16.18.22" + resolved "https://registry.yarnpkg.com/@types/node/-/node-16.18.22.tgz#a6505a5da1387aae03fddfb93591118f27b4c0ea" + integrity sha512-LJSIirgASa1LicFGTUFwDY7BfKDtLIbijqDLkH47LxEo/jtdrtiZ4/kLPD99bEQhTcPcuh6KhDllHqRxygJD2w== "@types/normalize-package-data@^2.4.1": version "2.4.1" @@ -1142,6 +1177,16 @@ acorn-jsx@^5.3.2: resolved "https://registry.yarnpkg.com/acorn-jsx/-/acorn-jsx-5.3.2.tgz#7ed5bb55908b3b2f1bc55c6af1653bada7f07937" integrity sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ== +acorn-walk@^8.1.1: + version "8.2.0" + resolved "https://registry.yarnpkg.com/acorn-walk/-/acorn-walk-8.2.0.tgz#741210f2e2426454508853a2f44d0ab83b7f69c1" + integrity sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA== + +acorn@^8.4.1: + version "8.8.2" + resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.8.2.tgz#1b2f25db02af965399b9776b0c2c391276d37c4a" + integrity sha512-xjIYgE8HBrkpd/sJqOGNspf8uHG+NOHGOw6a/Urj8taM2EXfdNAH2oFcPeIFfsv3+kz/mJrS5VuMqbNLjCa2vw== + acorn@^8.8.0: version "8.8.1" resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.8.1.tgz#0a3f9cbecc4ec3bea6f0a80b66ae8dd2da250b73" @@ -1201,6 +1246,11 @@ anymatch@^3.0.3, anymatch@~3.1.2: normalize-path "^3.0.0" picomatch "^2.0.4" +arg@^4.1.0: + version "4.1.3" + resolved "https://registry.yarnpkg.com/arg/-/arg-4.1.3.tgz#269fc7ad5b8e42cb63c896d5666017261c144089" + integrity 
sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA== + argparse@^1.0.7: version "1.0.10" resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911" @@ -1550,6 +1600,11 @@ convert-source-map@^2.0.0: resolved "https://registry.yarnpkg.com/convert-source-map/-/convert-source-map-2.0.0.tgz#4b560f649fc4e918dd0ab75cf4961e8bc882d82a" integrity sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg== +create-require@^1.1.0: + version "1.1.1" + resolved "https://registry.yarnpkg.com/create-require/-/create-require-1.1.1.tgz#c1d7e8f1e5f6cfc9ff65f9cd352d37348756c333" + integrity sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ== + cross-spawn@^6.0.5: version "6.0.5" resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4" @@ -1648,6 +1703,11 @@ diff-sequences@^29.3.1: resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.3.1.tgz#104b5b95fe725932421a9c6e5b4bef84c3f2249e" integrity sha512-hlM3QR272NXCi4pq+N4Kok4kOp6EsgOM3ZSpJI7Da3UAs+Ttsi8MRmB6trM/lhyzUxGfOgnpkHtgqm5Q/CTcfQ== +diff@^4.0.1: + version "4.0.2" + resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d" + integrity sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A== + diff@^5.0.0, diff@^5.1.0: version "5.1.0" resolved "https://registry.yarnpkg.com/diff/-/diff-5.1.0.tgz#bc52d298c5ea8df9194800224445ed43ffc87e40" @@ -2514,7 +2574,7 @@ inflight@^1.0.4: once "^1.3.0" wrappy "1" -inherits@2, inherits@^2.0.3, inherits@~2.0.4: +inherits@2: version "2.0.4" resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== @@ -3313,7 +3373,7 @@ make-dir@^3.0.0: dependencies: semver "^6.0.0" -make-error@1.x: +make-error@1.x, make-error@^1.1.1: version "1.3.6" resolved "https://registry.yarnpkg.com/make-error/-/make-error-1.3.6.tgz#2eb2e37ea9b67c4891f684a1394799af484cf7a2" integrity sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw== @@ -4273,15 +4333,6 @@ read-pkg@^7.1.0: parse-json "^5.2.0" type-fest "^2.0.0" -readable-stream@^3.5.0: - version "3.6.0" - resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198" - integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA== - dependencies: - inherits "^2.0.3" - string_decoder "^1.1.1" - util-deprecate "^1.0.1" - readdirp@~3.6.0: version "3.6.0" resolved "https://registry.yarnpkg.com/readdirp/-/readdirp-3.6.0.tgz#74a370bd857116e245b29cc97340cd431a02a6c7" @@ -4454,11 +4505,6 @@ safe-buffer@~5.1.1: resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.1.2.tgz#991ec69d296e0313747d59bdfd2b745c35f8828d" integrity sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g== -safe-buffer@~5.2.0: - version "5.2.1" - resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" - integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== - safe-regex-test@^1.0.0: version "1.0.0" resolved 
"https://registry.yarnpkg.com/safe-regex-test/-/safe-regex-test-1.0.0.tgz#793b874d524eb3640d1873aad03596db2d4f2295" @@ -4616,14 +4662,6 @@ stack-utils@^2.0.3: dependencies: escape-string-regexp "^2.0.0" -stream-browserify@^3.0.0: - version "3.0.0" - resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-3.0.0.tgz#22b0a2850cdf6503e73085da1fc7b7d0c2122f2f" - integrity sha512-H73RAHsVBapbim0tU2JwwOiXUj+fikfiaoYAKHF3VJfA0pe2BCzkhAHBlLG6REzE+2WNZcxOXjK7lkso+9euLA== - dependencies: - inherits "~2.0.4" - readable-stream "^3.5.0" - string-length@^4.0.1: version "4.0.2" resolved "https://registry.yarnpkg.com/string-length/-/string-length-4.0.2.tgz#a8a8dc7bd5c1a82b9b3c8b87e125f66871b6e57a" @@ -4693,13 +4731,6 @@ string.prototype.trimstart@^1.0.6: define-properties "^1.1.4" es-abstract "^1.20.4" -string_decoder@^1.1.1: - version "1.3.0" - resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" - integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== - dependencies: - safe-buffer "~5.2.0" - stringify-entities@^4.0.2: version "4.0.3" resolved "https://registry.yarnpkg.com/stringify-entities/-/stringify-entities-4.0.3.tgz#cfabd7039d22ad30f3cc435b0ca2c1574fc88ef8" @@ -4847,6 +4878,25 @@ ts-jest@^29.0.3: semver "7.x" yargs-parser "^21.0.1" +ts-node@^10.9.1: + version "10.9.1" + resolved "https://registry.yarnpkg.com/ts-node/-/ts-node-10.9.1.tgz#e73de9102958af9e1f0b168a6ff320e25adcff4b" + integrity sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw== + dependencies: + "@cspotcode/source-map-support" "^0.8.0" + "@tsconfig/node10" "^1.0.7" + "@tsconfig/node12" "^1.0.7" + "@tsconfig/node14" "^1.0.0" + "@tsconfig/node16" "^1.0.2" + acorn "^8.4.1" + acorn-walk "^8.1.1" + arg "^4.1.0" + create-require "^1.1.0" + diff "^4.0.1" + make-error "^1.1.1" + v8-compile-cache-lib "^3.0.1" + yn "3.1.1" + tsconfig-paths@^3.14.1: version "3.14.1" resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.1.tgz#ba0734599e8ea36c862798e920bcf163277b137a" @@ -5024,11 +5074,6 @@ uri-js@^4.2.2: dependencies: punycode "^2.1.0" -util-deprecate@^1.0.1: - version "1.0.2" - resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" - integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8= - uvu@^0.5.0: version "0.5.6" resolved "https://registry.yarnpkg.com/uvu/-/uvu-0.5.6.tgz#2754ca20bcb0bb59b64e9985e84d2e81058502df" @@ -5039,6 +5084,11 @@ uvu@^0.5.0: kleur "^4.0.3" sade "^1.7.3" +v8-compile-cache-lib@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz#6336e8d71965cb3d35a1bbb7868445a7c05264bf" + integrity sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg== + v8-to-istanbul@^9.0.1: version "9.0.1" resolved "https://registry.yarnpkg.com/v8-to-istanbul/-/v8-to-istanbul-9.0.1.tgz#b6f994b0b5d4ef255e17a0d17dc444a9f5132fa4" @@ -5195,6 +5245,11 @@ yargs@^17.3.1, yargs@^17.5.1: y18n "^5.0.5" yargs-parser "^21.1.1" +yn@3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/yn/-/yn-3.1.1.tgz#1e87401a09d767c1d5eab26a6e4c185182d2eb50" + integrity sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q== + yocto-queue@^0.1.0: version "0.1.0" resolved 
"https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-0.1.0.tgz#0294eb3dee05028d31ee1a5fa2c556a6aaf10a1b" From 0e5bad1054b44c6994e0644e77dce5ead1f26128 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Mon, 3 Apr 2023 15:50:51 -0600 Subject: [PATCH 08/17] Remove encoding and parseAll options --- README.md | 27 +++++++++------------------ src/api.ts | 44 ++------------------------------------------ test/format.test.ts | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index df93468..b6821cb 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,9 @@ import { ;(async () => { const readStream = createReadStream('/path/to/my/file.gff3') const streamOfGFF3 = Readable.toWeb(readStream).pipeThrough( - new TransformStream(new GFFTransformer({ parseAll: true })), + new TransformStream( + new GFFTransformer({ parseComments: true, parseDirectives: true }), + ), ) for await (const data of streamOfGFF3) { if ('directive' in data) { @@ -78,7 +80,11 @@ import { // read a file, format it, and write it to a new file. inserts sync marks and // a '##gff-version 3' header if one is not already present await Readable.toWeb(createReadStream('/path/to/my/file.gff3')) - .pipeThrough(new TransformStream(new GFFTransformer({ parseAll: true }))) + .pipeThrough( + new TransformStream( + new GFFTransformer({ parseComments: true, parseDirectives: true }), + ), + ) .pipeThrough(new TransformStream(new GFFFormattingTransformer())) .pipeTo(Writable.toWeb(createWriteStream('/path/to/my/file.gff3'))) })() @@ -250,12 +256,10 @@ ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA`)[ - [ParseOptions](#parseoptions) - [disableDerivesFromReferences](#disablederivesfromreferences) - - [encoding](#encoding) - [parseFeatures](#parsefeatures) - [parseDirectives](#parsedirectives) - [parseComments](#parsecomments) - [parseSequences](#parsesequences) - - [parseAll](#parseall) - [bufferSize](#buffersize) - [GFFTransformer](#gfftransformer) - [Parameters](#parameters) @@ -283,12 +287,6 @@ Whether to resolve references to derives from features Type: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean) -#### encoding - -Text encoding of the input GFF3. default 'utf8' - -Type: BufferEncoding - #### parseFeatures Whether to parse features, default true @@ -313,13 +311,6 @@ Whether to parse sequences, default true Type: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean) -#### parseAll - -Parse all features, directives, comments, and sequences. Overrides other -parsing options. Default false. - -Type: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean) - #### bufferSize Maximum number of GFF3 lines to buffer, default 1000 @@ -354,7 +345,7 @@ parsed items. 
#### Parameters - `str` **[string](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String)** GFF3 string -- `inputOptions` **({disableDerivesFromReferences: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean)?, encoding: BufferEncoding?, bufferSize: [number](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number)?} | [undefined](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/undefined))?** Parsing options +- `inputOptions` **({disableDerivesFromReferences: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean)?, bufferSize: [number](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number)?} | [undefined](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/undefined))?** Parsing options Returns **[Array](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array)<(GFF3Feature | GFF3Sequence)>** array of parsed features, directives, comments and/or sequences diff --git a/src/api.ts b/src/api.ts index c3f607b..2e1f525 100644 --- a/src/api.ts +++ b/src/api.ts @@ -13,8 +13,6 @@ import { export interface ParseOptions { /** Whether to resolve references to derives from features */ disableDerivesFromReferences?: boolean - /** Text encoding of the input GFF3. default 'utf8' */ - encoding?: BufferEncoding /** Whether to parse features, default true */ parseFeatures?: boolean /** Whether to parse directives, default false */ @@ -23,21 +21,15 @@ export interface ParseOptions { parseComments?: boolean /** Whether to parse sequences, default true */ parseSequences?: boolean - /** - * Parse all features, directives, comments, and sequences. Overrides other - * parsing options. Default false. - */ - parseAll?: boolean /** Maximum number of GFF3 lines to buffer, default 1000 */ bufferSize?: number } -type ParseOptionsProcessed = Required> +type ParseOptionsProcessed = Required // shared arg processing for the parse routines function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { const out = { - encoding: 'utf8' as const, parseFeatures: true, parseDirectives: false, parseSequences: true, @@ -47,13 +39,6 @@ function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { ...options, } - if (options.parseAll) { - out.parseFeatures = true - out.parseDirectives = true - out.parseComments = true - out.parseSequences = true - } - return out } @@ -75,8 +60,8 @@ export class GFFTransformer implements Transformer { * @param options - Parser options */ constructor(options: ParseOptions = {}) { + this.decoder = new TextDecoder() const processedOptions = _processParseOptions(options) - this.decoder = new TextDecoder(processedOptions.encoding) const { bufferSize, disableDerivesFromReferences } = processedOptions this.parser = new Parser({ bufferSize, disableDerivesFromReferences }) this.parseFeatures = processedOptions.parseFeatures @@ -172,26 +157,15 @@ export function parseStringSync( inputOptions?: | { disableDerivesFromReferences?: boolean - encoding?: BufferEncoding bufferSize?: number } | undefined, ): (GFF3Feature | GFF3Sequence)[] -export function parseStringSync( - str: string, - inputOptions: { - parseAll?: T - disableDerivesFromReferences?: boolean - encoding?: BufferEncoding - bufferSize?: number - }, -): T extends true ? 
GFF3Item[] : never export function parseStringSync( str: string, inputOptions: { disableDerivesFromReferences?: boolean parseFeatures: F - encoding?: BufferEncoding bufferSize?: number }, ): F extends true ? (GFF3Feature | GFF3Sequence)[] : GFF3Sequence[] @@ -200,7 +174,6 @@ export function parseStringSync( inputOptions: { disableDerivesFromReferences?: boolean parseDirectives: D - encoding?: BufferEncoding bufferSize?: number }, ): D extends true @@ -210,7 +183,6 @@ export function parseStringSync( str: string, inputOptions: { parseComments: C - encoding?: BufferEncoding bufferSize?: number }, ): C extends true @@ -221,7 +193,6 @@ export function parseStringSync( inputOptions: { disableDerivesFromReferences?: boolean parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): S extends true ? (GFF3Feature | GFF3Sequence)[] : GFF3Feature[] @@ -231,7 +202,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseFeatures: F parseDirectives: D - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -247,7 +217,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseFeatures: F parseComments: C - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -263,7 +232,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseFeatures: F parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -279,7 +247,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseDirectives: D parseComments: C - encoding?: BufferEncoding bufferSize?: number }, ): D extends true @@ -295,7 +262,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseDirectives: D parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): D extends true @@ -311,7 +277,6 @@ export function parseStringSync( disableDerivesFromReferences?: boolean parseComments: C parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): C extends true @@ -332,7 +297,6 @@ export function parseStringSync< parseFeatures: F parseDirectives: D parseComments: C - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -361,7 +325,6 @@ export function parseStringSync< parseFeatures: F parseDirectives: D parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -390,7 +353,6 @@ export function parseStringSync< parseFeatures: F parseComments: C parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): F extends true @@ -419,7 +381,6 @@ export function parseStringSync< parseFeatures: D parseComments: C parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): D extends true @@ -450,7 +411,6 @@ export function parseStringSync< parseDirectives: D parseComments: C parseSequences: S - encoding?: BufferEncoding bufferSize?: number }, ): F extends true diff --git a/test/format.test.ts b/test/format.test.ts index 4fbf229..8fa4094 100644 --- a/test/format.test.ts +++ b/test/format.test.ts @@ -21,7 +21,10 @@ describe('GFF3 formatting', () => { ) ).replace(/###\n/g, '') // formatSync does not insert sync marks - const items = gff.parseStringSync(inputGFF3, { parseAll: true }) + const items = gff.parseStringSync(inputGFF3, { + parseComments: true, + parseDirectives: true, + }) const resultGFF3 = gff.formatSync(items) expect(resultGFF3).toEqual(expectedGFF3) }) @@ -68,7 +71,14 @@ describe('GFF3 formatting', () => { await new ReadableStream( new 
FileSource(require.resolve(`./data/${file}.gff3`)), ) - .pipeThrough(new TransformStream(gff.parseStream({ parseAll: true }))) + .pipeThrough( + new TransformStream( + gff.parseStream({ + parseComments: true, + parseDirectives: true, + }), + ), + ) .pipeThrough( new TransformStream( gff.formatStream({ insertVersionDirective: true }), From 088649e6daef4ac9517ba72e881d3b2fa94e30bf Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Mon, 3 Apr 2023 15:53:18 -0600 Subject: [PATCH 09/17] Raise default bufferSize from 1000 to 50000 --- README.md | 2 +- src/api.ts | 4 ++-- src/parse.ts | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b6821cb..ce7c405 100644 --- a/README.md +++ b/README.md @@ -313,7 +313,7 @@ Type: [boolean](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Glob #### bufferSize -Maximum number of GFF3 lines to buffer, default 1000 +Maximum number of GFF3 lines to buffer, default 50000 Type: [number](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number) diff --git a/src/api.ts b/src/api.ts index 2e1f525..50abe01 100644 --- a/src/api.ts +++ b/src/api.ts @@ -21,7 +21,7 @@ export interface ParseOptions { parseComments?: boolean /** Whether to parse sequences, default true */ parseSequences?: boolean - /** Maximum number of GFF3 lines to buffer, default 1000 */ + /** Maximum number of GFF3 lines to buffer, default 50000 */ bufferSize?: number } @@ -34,7 +34,7 @@ function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { parseDirectives: false, parseSequences: true, parseComments: false, - bufferSize: 1000, + bufferSize: 50000, disableDerivesFromReferences: false, ...options, } diff --git a/src/parse.ts b/src/parse.ts index b03be50..0c790fc 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -93,7 +93,7 @@ export default class Parser { args.disableDerivesFromReferences || false // number of lines to buffer - this.bufferSize = args.bufferSize === undefined ? 1000 : args.bufferSize + this.bufferSize = args.bufferSize === undefined ? 50000 : args.bufferSize } addLine(line: string, callbacks: ParseCallbacks): void { From 7f08375da97c201734d00f6791233665ce84109c Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Mon, 3 Apr 2023 16:01:33 -0600 Subject: [PATCH 10/17] Remove wrapper functions --- README.md | 34 +++------------------------------- src/api.ts | 25 ------------------------- test/format.test.ts | 19 ++++++++++++------- test/parse.test.ts | 4 ++-- 4 files changed, 17 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index ce7c405..ef3223b 100644 --- a/README.md +++ b/README.md @@ -263,19 +263,15 @@ ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA`)[ - [bufferSize](#buffersize) - [GFFTransformer](#gfftransformer) - [Parameters](#parameters) -- [parseStream](#parsestream) - - [Parameters](#parameters-1) - [parseStringSync](#parsestringsync) - - [Parameters](#parameters-2) + - [Parameters](#parameters-1) - [formatSync](#formatsync) - - [Parameters](#parameters-3) + - [Parameters](#parameters-2) - [FormatOptions](#formatoptions) - [minSyncLines](#minsynclines) - [insertVersionDirective](#insertversiondirective) - [GFFFormattingTransformer](#gffformattingtransformer) - - [Parameters](#parameters-4) -- [formatStream](#formatstream) - - [Parameters](#parameters-5) + - [Parameters](#parameters-3) ### ParseOptions @@ -326,17 +322,6 @@ an sequence objects. 
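
A sketch of wiring the transformer into a Node web stream; the file path and module specifier are placeholders, and the option handling follows the defaults described in `ParseOptions` (features and sequences on, directives and comments off):

```ts
import { createReadStream } from 'fs'
import { Readable } from 'stream'
import { TransformStream } from 'stream/web'
import { GFFTransformer } from '@gmod/gff' // module name assumed

;(async () => {
  const parsed = Readable.toWeb(
    createReadStream('/path/to/my/file.gff3'),
  ).pipeThrough(
    // with the defaults only features and sequences are emitted, so opt in
    // to directives and comments here to receive those items too
    new TransformStream(
      new GFFTransformer({ parseDirectives: true, parseComments: true }),
    ),
  )

  for await (const item of parsed) {
    if ('directive' in item) {
      console.log('directive', item.directive)
    } else if ('comment' in item) {
      console.log('comment', item.comment)
    } else if ('sequence' in item) {
      console.log('FASTA sequence', item.id)
    } else {
      // a GFF3Feature is an array of the feature's locations (lines)
      console.log('feature', item[0].seq_id)
    }
  }
})()
```
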
- `options` **[ParseOptions](#parseoptions)** Parser options (optional, default `{}`) -### parseStream - -Parse a stream of text data into a stream of feature, directive, comment, -an sequence objects. - -#### Parameters - -- `options` **[ParseOptions](#parseoptions)** Parsing options (optional, default `{}`) - -Returns **[GFFTransformer](#gfftransformer)** stream (in objectMode) of parsed items - ### parseStringSync Synchronously parse a string containing GFF3 and return an array of the @@ -389,19 +374,6 @@ Inserts synchronization (###) marks automatically. - `options` **[FormatOptions](#formatoptions)** Formatter options (optional, default `{}`) -### formatStream - -Format a stream of features, directives, comments and/or sequences into a -stream of GFF3 text. - -Inserts synchronization (###) marks automatically. - -#### Parameters - -- `options` **[FormatOptions](#formatoptions)** parser options (optional, default `{}`) - -Returns **[GFFFormattingTransformer](#gffformattingtransformer)** - ## About `util` There is also a `util` module that contains super-low-level functions for dealing with lines and parts of lines. diff --git a/src/api.ts b/src/api.ts index 50abe01..80a3df4 100644 --- a/src/api.ts +++ b/src/api.ts @@ -133,17 +133,6 @@ export class GFFTransformer implements Transformer { } } -/** - * Parse a stream of text data into a stream of feature, directive, comment, - * an sequence objects. - * - * @param options - Parsing options - * @returns stream (in objectMode) of parsed items - */ -export function parseStream(options: ParseOptions = {}): GFFTransformer { - return new GFFTransformer(options) -} - /** * Synchronously parse a string containing GFF3 and return an array of the * parsed items. @@ -589,17 +578,3 @@ export class GFFFormattingTransformer implements Transformer { this.haveWeEmittedData = true } } - -/** - * Format a stream of features, directives, comments and/or sequences into a - * stream of GFF3 text. - * - * Inserts synchronization (###) marks automatically. 
- * - * @param options - parser options - */ -export function formatStream( - options: FormatOptions = {}, -): GFFFormattingTransformer { - return new GFFFormattingTransformer(options) -} diff --git a/test/format.test.ts b/test/format.test.ts index 8fa4094..59bd0b7 100644 --- a/test/format.test.ts +++ b/test/format.test.ts @@ -2,7 +2,12 @@ import fsPromises from 'fs/promises' import { ReadableStream, WritableStream, TransformStream } from 'stream/web' import tmp from 'tmp-promise' -import gff from '../src' +import { + GFFFormattingTransformer, + GFFTransformer, + formatSync, + parseStringSync, +} from '../src' import { FileSource, SyncFileSink } from './util' describe('GFF3 formatting', () => { @@ -21,11 +26,11 @@ describe('GFF3 formatting', () => { ) ).replace(/###\n/g, '') // formatSync does not insert sync marks - const items = gff.parseStringSync(inputGFF3, { + const items = parseStringSync(inputGFF3, { parseComments: true, parseDirectives: true, }) - const resultGFF3 = gff.formatSync(items) + const resultGFF3 = formatSync(items) expect(resultGFF3).toEqual(expectedGFF3) }) @@ -40,14 +45,14 @@ describe('GFF3 formatting', () => { ) .pipeThrough( new TransformStream( - gff.parseStream({ + new GFFTransformer({ parseFeatures: true, parseComments: true, parseDirectives: true, }), ), ) - .pipeThrough(new TransformStream(gff.formatStream())) + .pipeThrough(new TransformStream(new GFFFormattingTransformer())) const chunks: string[] = [] const reader = stream.getReader() let result: ReadableStreamReadResult @@ -73,7 +78,7 @@ describe('GFF3 formatting', () => { ) .pipeThrough( new TransformStream( - gff.parseStream({ + new GFFTransformer({ parseComments: true, parseDirectives: true, }), @@ -81,7 +86,7 @@ describe('GFF3 formatting', () => { ) .pipeThrough( new TransformStream( - gff.formatStream({ insertVersionDirective: true }), + new GFFFormattingTransformer({ insertVersionDirective: true }), ), ) .pipeTo(new WritableStream(new SyncFileSink(tmpFile.fd))) diff --git a/test/parse.test.ts b/test/parse.test.ts index 7bd6d58..d1b373c 100644 --- a/test/parse.test.ts +++ b/test/parse.test.ts @@ -1,6 +1,6 @@ import fsPromises from 'fs/promises' import { ReadableStream, TransformStream } from 'stream/web' -import gff from '../src' +import gff, { GFFTransformer } from '../src' import { formatFeature, GFF3Feature, @@ -32,7 +32,7 @@ async function readAll( const stream = new ReadableStream(new FileSource(require.resolve(filename))) const transformStream = new TransformStream( - gff.parseStream({ + new GFFTransformer({ parseFeatures: true, parseDirectives: true, parseComments: true, From 4ddd64c0bc8fba37be6987a94bb6f663c664d6a3 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Mon, 10 Apr 2023 16:30:01 -0600 Subject: [PATCH 11/17] Colocate tests --- jest.config.js | 5 +++-- test/format.test.ts => src/api.test.ts | 16 +++++++------- {test => src}/parse.test.ts | 30 +++++++++++++------------- test/utils.test.ts => src/util.test.ts | 4 ++-- 4 files changed, 28 insertions(+), 27 deletions(-) rename test/format.test.ts => src/api.test.ts (85%) rename {test => src}/parse.test.ts (88%) rename test/utils.test.ts => src/util.test.ts (97%) diff --git a/jest.config.js b/jest.config.js index 8cbf894..982170c 100644 --- a/jest.config.js +++ b/jest.config.js @@ -1,5 +1,6 @@ -/** @type {import('ts-jest/dist/types').InitialOptionsTsJest} */ +/** @type {import('ts-jest/dist/types').JestConfigWithTsJest} */ module.exports = { preset: 'ts-jest', testEnvironment: 'node', -}; \ No newline at end of file + collectCoverageFrom: 
['**/src/*.ts'], +} diff --git a/test/format.test.ts b/src/api.test.ts similarity index 85% rename from test/format.test.ts rename to src/api.test.ts index 59bd0b7..541ee05 100644 --- a/test/format.test.ts +++ b/src/api.test.ts @@ -7,21 +7,21 @@ import { GFFTransformer, formatSync, parseStringSync, -} from '../src' -import { FileSource, SyncFileSink } from './util' +} from '.' +import { FileSource, SyncFileSink } from '../test/util' describe('GFF3 formatting', () => { ;['spec_eden', 'au9_scaffold_subset', 'hybrid1', 'hybrid2'].forEach( (file) => { it(`can roundtrip ${file}.gff3 with formatSync`, async () => { const inputGFF3 = await fsPromises.readFile( - require.resolve(`./data/${file}.gff3`), + require.resolve(`../test/data/${file}.gff3`), 'utf8', ) const expectedGFF3 = ( await fsPromises.readFile( - require.resolve(`./data/${file}.reformatted.gff3`), + require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) ).replace(/###\n/g, '') // formatSync does not insert sync marks @@ -36,12 +36,12 @@ describe('GFF3 formatting', () => { it(`can roundtrip ${file}.gff3 with formatStream`, async () => { const expectedGFF3 = await fsPromises.readFile( - require.resolve(`./data/${file}.reformatted.gff3`), + require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) const stream = new ReadableStream( - new FileSource(require.resolve(`./data/${file}.gff3`)), + new FileSource(require.resolve(`../test/data/${file}.gff3`)), ) .pipeThrough( new TransformStream( @@ -74,7 +74,7 @@ describe('GFF3 formatting', () => { jest.setTimeout(1000) await tmp.withFile(async (tmpFile) => { await new ReadableStream( - new FileSource(require.resolve(`./data/${file}.gff3`)), + new FileSource(require.resolve(`../test/data/${file}.gff3`)), ) .pipeThrough( new TransformStream( @@ -94,7 +94,7 @@ describe('GFF3 formatting', () => { const resultGFF3 = await fsPromises.readFile(tmpFile.path, 'utf8') const expectedGFF3 = await fsPromises.readFile( - require.resolve(`./data/${file}.reformatted.gff3`), + require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) diff --git a/test/parse.test.ts b/src/parse.test.ts similarity index 88% rename from test/parse.test.ts rename to src/parse.test.ts index d1b373c..eaa35a5 100644 --- a/test/parse.test.ts +++ b/src/parse.test.ts @@ -8,7 +8,7 @@ import { GFF3Comment, GFF3Sequence, } from '../src/util' -import { FileSource } from './util' +import { FileSource } from '../test/util' interface ReadAllResults { features: GFF3Feature[] @@ -54,10 +54,10 @@ async function readAll( describe('GFF3 parser', () => { it('can parse gff3_with_syncs.gff3', async () => { - const stuff = await readAll('./data/gff3_with_syncs.gff3') + const stuff = await readAll('../test/data/gff3_with_syncs.gff3') const referenceResult = JSON.parse( await fsPromises.readFile( - require.resolve('./data/gff3_with_syncs.result.json'), + require.resolve('../test/data/gff3_with_syncs.result.json'), 'utf8', ), ) @@ -79,17 +79,17 @@ describe('GFF3 parser', () => { [8, 'quantitative.gff3'], ].forEach(([count, filename]) => { it(`can cursorily parse ${filename}`, async () => { - const stuff = await readAll(`./data/${filename}`) + const stuff = await readAll(`../test/data/${filename}`) expect(stuff.all.length).toEqual(count) }) }) it('supports children before parents, and Derives_from', async () => { - const stuff = await readAll('./data/knownGene_out_of_order.gff3') + const stuff = await readAll('../test/data/knownGene_out_of_order.gff3') // $p->max_lookback(2); const expectedOutput = JSON.parse( await 
fsPromises.readFile( - require.resolve('./data/knownGene_out_of_order.result.json'), + require.resolve('../test/data/knownGene_out_of_order.result.json'), 'utf8', ), ) @@ -97,7 +97,7 @@ describe('GFF3 parser', () => { }) it('can parse the EDEN gene from the gff3 spec', async () => { - const stuff = await readAll('./data/spec_eden.gff3') + const stuff = await readAll('../test/data/spec_eden.gff3') expect(stuff.all[2]).toHaveLength(1) const [eden] = stuff.all[2] as GFF3Feature @@ -130,7 +130,7 @@ describe('GFF3 parser', () => { const referenceResult = JSON.parse( await fsPromises.readFile( - require.resolve('./data/spec_eden.result.json'), + require.resolve('../test/data/spec_eden.result.json'), 'utf8', ), ) @@ -138,18 +138,18 @@ describe('GFF3 parser', () => { }) it('can parse an excerpt of the refGene gff3', async () => { - const stuff = await readAll('./data/refGene_excerpt.gff3') + const stuff = await readAll('../test/data/refGene_excerpt.gff3') expect(true).toBeTruthy() expect(stuff.all).toHaveLength(2) }) it('can parse an excerpt of the TAIR10 gff3', async () => { - const stuff = await readAll('./data/tair10.gff3') + const stuff = await readAll('../test/data/tair10.gff3') expect(stuff.all).toHaveLength(3) }) it('can parse chr1 TAIR10 gff3', async () => { - const stuff = await readAll('./data/tair10_chr1.gff', { + const stuff = await readAll('../test/data/tair10_chr1.gff', { disableDerivesFromReferences: true, }) expect(stuff.all).toHaveLength(17697) @@ -159,7 +159,7 @@ describe('GFF3 parser', () => { ;['mm9_sample_ensembl.gff3', 'Saccharomyces_cerevisiae_EF3_e64.gff3'].forEach( (errorFile) => { it(`throws an error when parsing ${errorFile}`, async () => { - await expect(readAll(`./data/${errorFile}`)).rejects.toMatch( + await expect(readAll(`../test/data/${errorFile}`)).rejects.toMatch( /inconsistent types/, ) }) @@ -168,7 +168,7 @@ describe('GFF3 parser', () => { it('can parse a string synchronously', async () => { const gff3 = await fsPromises.readFile( - require.resolve('./data/spec_eden.gff3'), + require.resolve('../test/data/spec_eden.gff3'), 'utf8', ) const result = gff.parseStringSync(gff3, { @@ -179,7 +179,7 @@ describe('GFF3 parser', () => { expect(result).toHaveLength(3) const referenceResult = JSON.parse( await fsPromises.readFile( - require.resolve('./data/spec_eden.result.json'), + require.resolve('../test/data/spec_eden.result.json'), 'utf8', ), ) @@ -290,7 +290,7 @@ SL2.40%25ch01 IT%25AG eugene g%25e;ne 80999140 81004317 . + . Alias=Solyc01g0988 ], ].forEach(([filename, expectedOutput]) => { it(`can parse FASTA sections in hybrid ${filename} file`, async () => { - const stuff = await readAll(`./data/${filename}`) + const stuff = await readAll(`../test/data/${filename}`) expect(stuff.sequences).toEqual(expectedOutput) }) }) diff --git a/test/utils.test.ts b/src/util.test.ts similarity index 97% rename from test/utils.test.ts rename to src/util.test.ts index 8a9f575..3a6b5f8 100644 --- a/test/utils.test.ts +++ b/src/util.test.ts @@ -1,5 +1,5 @@ -import gff from '../src' -import { GFF3FeatureLine } from '../src/util' +import gff from '.' 
+import { GFF3FeatureLine } from './util' const { parseAttributes, From bc8436413ad858ecec2d1eb2f64b871f3fd24376 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 11 Apr 2023 15:28:21 -0600 Subject: [PATCH 12/17] Fold FASTA lines to length 80 when formatting --- src/util.ts | 13 ++++++++++--- test/data/hybrid1.reformatted.gff3 | 3 ++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/util.ts b/src/util.ts index c899e68..2cd4c30 100644 --- a/src/util.ts +++ b/src/util.ts @@ -272,9 +272,16 @@ export function formatComment(comment: GFF3Comment): string { * @returns Formatted single FASTA sequence string */ export function formatSequence(seq: GFF3Sequence): string { - return `>${seq.id}${seq.description ? ` ${seq.description}` : ''}\n${ - seq.sequence - }\n` + const header = `>${seq.id}${seq.description ? ` ${seq.description}` : ''}\n` + // split sequence chunks into lines of length 80 for embedded FASTA + const lineLength = 80 + const numChunks = Math.ceil(seq.sequence.length / lineLength) + const chunks = new Array(numChunks) + for (let i = 0; i < numChunks; i += 1) { + const start = i * lineLength + chunks[i] = seq.sequence.substring(start, start + lineLength) + } + return `${header}${chunks.join('\n')}\n` } function formatSingleItem( diff --git a/test/data/hybrid1.reformatted.gff3 b/test/data/hybrid1.reformatted.gff3 index 5bd53d8..a85d6c2 100644 --- a/test/data/hybrid1.reformatted.gff3 +++ b/test/data/hybrid1.reformatted.gff3 @@ -9,6 +9,7 @@ chr17 UCSC CDS 62469497 62469506 . - 0 Parent=A00469 >A00469 GATTACAGATTACA >zonker -AAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCCAAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCC +AAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCCAAA +AAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCC >zeebo this is a test description AAAAACTAGTAGCTAGCTAGCTGATCATAGATCGATGCATGGCATACTGACTGATCGACCCCCC From d2f632e02ce1630d18f5d45584b399a991eadbd5 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Mon, 10 Apr 2023 16:46:03 -0600 Subject: [PATCH 13/17] Update test imports --- src/api.test.ts | 18 +++++++++--------- src/index.ts | 1 + src/parse.test.ts | 28 ++++++++++++++-------------- src/util.test.ts | 20 ++++++++++---------- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/api.test.ts b/src/api.test.ts index 541ee05..f996945 100644 --- a/src/api.test.ts +++ b/src/api.test.ts @@ -1,26 +1,26 @@ -import fsPromises from 'fs/promises' +import { readFile } from 'fs/promises' import { ReadableStream, WritableStream, TransformStream } from 'stream/web' -import tmp from 'tmp-promise' +import { withFile } from 'tmp-promise' import { GFFFormattingTransformer, GFFTransformer, formatSync, parseStringSync, -} from '.' 
+} from './api' import { FileSource, SyncFileSink } from '../test/util' describe('GFF3 formatting', () => { ;['spec_eden', 'au9_scaffold_subset', 'hybrid1', 'hybrid2'].forEach( (file) => { it(`can roundtrip ${file}.gff3 with formatSync`, async () => { - const inputGFF3 = await fsPromises.readFile( + const inputGFF3 = await readFile( require.resolve(`../test/data/${file}.gff3`), 'utf8', ) const expectedGFF3 = ( - await fsPromises.readFile( + await readFile( require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) @@ -35,7 +35,7 @@ describe('GFF3 formatting', () => { }) it(`can roundtrip ${file}.gff3 with formatStream`, async () => { - const expectedGFF3 = await fsPromises.readFile( + const expectedGFF3 = await readFile( require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) @@ -72,7 +72,7 @@ describe('GFF3 formatting', () => { ].forEach((file) => { it(`can roundtrip ${file}.gff3 with formatStream and insertVersionDirective`, async () => { jest.setTimeout(1000) - await tmp.withFile(async (tmpFile) => { + await withFile(async (tmpFile) => { await new ReadableStream( new FileSource(require.resolve(`../test/data/${file}.gff3`)), ) @@ -91,9 +91,9 @@ describe('GFF3 formatting', () => { ) .pipeTo(new WritableStream(new SyncFileSink(tmpFile.fd))) - const resultGFF3 = await fsPromises.readFile(tmpFile.path, 'utf8') + const resultGFF3 = await readFile(tmpFile.path, 'utf8') - const expectedGFF3 = await fsPromises.readFile( + const expectedGFF3 = await readFile( require.resolve(`../test/data/${file}.reformatted.gff3`), 'utf8', ) diff --git a/src/index.ts b/src/index.ts index ea069a2..32e09d7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,7 @@ export type { } from './util' import * as util from './util' +export { util } export const defaultExport = { ...api, diff --git a/src/parse.test.ts b/src/parse.test.ts index eaa35a5..4582c7c 100644 --- a/src/parse.test.ts +++ b/src/parse.test.ts @@ -1,13 +1,13 @@ -import fsPromises from 'fs/promises' +import { readFile } from 'fs/promises' import { ReadableStream, TransformStream } from 'stream/web' -import gff, { GFFTransformer } from '../src' -import { - formatFeature, +import { GFFTransformer, parseStringSync } from './api' +import { formatFeature } from './util' +import type { GFF3Feature, - GFF3Directive, GFF3Comment, + GFF3Directive, GFF3Sequence, -} from '../src/util' +} from './util' import { FileSource } from '../test/util' interface ReadAllResults { @@ -56,7 +56,7 @@ describe('GFF3 parser', () => { it('can parse gff3_with_syncs.gff3', async () => { const stuff = await readAll('../test/data/gff3_with_syncs.gff3') const referenceResult = JSON.parse( - await fsPromises.readFile( + await readFile( require.resolve('../test/data/gff3_with_syncs.result.json'), 'utf8', ), @@ -88,7 +88,7 @@ describe('GFF3 parser', () => { const stuff = await readAll('../test/data/knownGene_out_of_order.gff3') // $p->max_lookback(2); const expectedOutput = JSON.parse( - await fsPromises.readFile( + await readFile( require.resolve('../test/data/knownGene_out_of_order.result.json'), 'utf8', ), @@ -129,7 +129,7 @@ describe('GFF3 parser', () => { expect(mrnaLines[2].child_features).toHaveLength(6) const referenceResult = JSON.parse( - await fsPromises.readFile( + await readFile( require.resolve('../test/data/spec_eden.result.json'), 'utf8', ), @@ -167,18 +167,18 @@ describe('GFF3 parser', () => { ) it('can parse a string synchronously', async () => { - const gff3 = await fsPromises.readFile( + const gff3 = await readFile( 
require.resolve('../test/data/spec_eden.gff3'), 'utf8', ) - const result = gff.parseStringSync(gff3, { + const result = parseStringSync(gff3, { parseFeatures: true, parseDirectives: true, parseComments: true, }) expect(result).toHaveLength(3) const referenceResult = JSON.parse( - await fsPromises.readFile( + await readFile( require.resolve('../test/data/spec_eden.result.json'), 'utf8', ), @@ -191,7 +191,7 @@ describe('GFF3 parser', () => { SL2.40%25ch01 IT%25AG eugene g%25e;ne 80999140 81004317 . + . multivalue = val1,val2, val3;testing = blah ` - const result = gff.parseStringSync(gff3, { + const result = parseStringSync(gff3, { parseFeatures: true, parseDirectives: true, parseComments: true, @@ -225,7 +225,7 @@ SL2.40%25ch01 IT%25AG eugene g%25e;ne 80999140 81004317 . + . multivalue = val1 SL2.40%25ch01 IT%25AG eugene g%25e;ne 80999140 81004317 . + . Alias=Solyc01g098840;ID=gene:Solyc01g098840.2;Name=Solyc01g098840.2;from_BOGAS=1;length=5178 ` - const result = gff.parseStringSync(gff3, { + const result = parseStringSync(gff3, { parseFeatures: true, parseDirectives: true, parseComments: true, diff --git a/src/util.test.ts b/src/util.test.ts index 3a6b5f8..68a99c6 100644 --- a/src/util.test.ts +++ b/src/util.test.ts @@ -1,17 +1,17 @@ -import gff from '.' -import { GFF3FeatureLine } from './util' - -const { +import { + escape, + escapeColumn, + formatFeature, parseAttributes, parseFeature, parseDirective, - formatFeature, - escapeColumn, -} = gff.util + unescape, + GFF3FeatureLine, +} from './util' describe('GFF3 utils', () => { it('can escape properly', () => { - expect(gff.util.escape(5)).toEqual('5') + expect(escape(5)).toEqual('5') // TODO: should add more escape tests }) it('can parse a bad directive', () => { @@ -20,8 +20,8 @@ describe('GFF3 utils', () => { }).not.toThrow() }) it('can unescape properly', () => { - expect(gff.util.unescape(' ')).toEqual(' ') - expect(gff.util.unescape('5')).toEqual('5') + expect(unescape(' ')).toEqual(' ') + expect(unescape('5')).toEqual('5') // TODO: should add more unescape tests }) const attributeCases: [string, Record][] = [ From 53b4a44199528e24f975e42c4689161e4ab1c5a0 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 11 Apr 2023 15:31:23 -0600 Subject: [PATCH 14/17] Use decode/encodeURIComponent for (un)escape --- src/util.ts | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/util.ts b/src/util.ts index 2cd4c30..26cc17c 100644 --- a/src/util.ts +++ b/src/util.ts @@ -8,15 +8,12 @@ * @returns An unescaped string value */ export function unescape(stringVal: string): string { - return stringVal.replace(/%([0-9A-Fa-f]{2})/g, (_match, seq) => - String.fromCharCode(parseInt(seq, 16)), - ) + return decodeURIComponent(stringVal) } function _escape(regex: RegExp, s: string | number) { - return String(s).replace(regex, (ch) => { - const hex = ch.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0') - return `%${hex}` + return String(s).replaceAll(regex, (ch) => { + return encodeURIComponent(ch).toUpperCase() }) } @@ -27,7 +24,7 @@ function _escape(regex: RegExp, s: string | number) { * @returns An escaped string value */ export function escape(rawVal: string | number): string { - return _escape(/[\n;\r\t=%&,\x00-\x1f\x7f-\xff]/g, rawVal) + return _escape(/[\n;\r\t=%&,\x00-\x1f\x7f]/g, rawVal) } /** @@ -37,7 +34,7 @@ export function escape(rawVal: string | number): string { * @returns An escaped column value */ export function escapeColumn(rawVal: string | number): string { - return 
_escape(/[\n\r\t%\x00-\x1f\x7f-\xff]/g, rawVal) + return _escape(/[\n\r\t%\x00-\x1f\x7f]/g, rawVal) } /** From 53fbd01aa4e34fdfa7166393f18ae0083f279f42 Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 11 Apr 2023 15:32:10 -0600 Subject: [PATCH 15/17] Util fixes and more thorough tests --- src/util.test.ts | 494 ++++++++++++++++++++++++++++++++++++++++------- src/util.ts | 30 ++- 2 files changed, 438 insertions(+), 86 deletions(-) diff --git a/src/util.test.ts b/src/util.test.ts index 68a99c6..39a1c22 100644 --- a/src/util.test.ts +++ b/src/util.test.ts @@ -1,99 +1,453 @@ import { escape, escapeColumn, + formatAttributes, + formatComment, + formatDirective, formatFeature, + formatItem, + formatSequence, parseAttributes, - parseFeature, parseDirective, + parseFeature, unescape, - GFF3FeatureLine, } from './util' -describe('GFF3 utils', () => { - it('can escape properly', () => { +describe('unescape(stringVal)', () => { + it('handles already unsecaped literals', () => { + expect(unescape(' ')).toEqual(' ') + expect(unescape('5')).toEqual('5') + }) + it('handles escaped characters', () => { + expect(unescape('%09')).toEqual('\t') + expect(unescape('%09%0A%0D%25%3B%3D%26%2C')).toEqual('\t\n\r%;=&,') + }) + it('unescapes everything, including spaces, mixed values, and non-specified characters.', () => { + expect(unescape('escaped space: "%20", unescaped space: " "')).toEqual( + 'escaped space: " ", unescaped space: " "', + ) + }) + it('unescapes an `escape()`d string', () => { + const loremIpsum = + 'value=Lorem ipsum dolor sit amet, consectetur adipiscing elit.' + expect(unescape(escape(loremIpsum))).toEqual(loremIpsum) + }) +}) + +describe('escape(rawval)', () => { + it('escapes spec-specified characters', () => { + expect(escape('\t')).toEqual('%09') + expect(escape('\t\n\r%;=&,')).toEqual('%09%0A%0D%25%3B%3D%26%2C') + }) + it("doesn't escape non-spec-specified characters", () => { + expect(escape(' ')).toEqual(' ') expect(escape(5)).toEqual('5') - // TODO: should add more escape tests + expect(escape('\t\t + : \n')).toEqual('%09%09 + : %0A') }) - it('can parse a bad directive', () => { +}) + +describe('escapeColumn(rawval)', () => { + it('escapes spec-specified column characters', () => { + expect(escapeColumn('\t')).toEqual('%09') + expect(escape('\t\n\r%')).toEqual('%09%0A%0D%25') + }) + it("doesn't escape non-spec-specified characters", () => { + expect(escapeColumn(' ')).toEqual(' ') + expect(escapeColumn(5)).toEqual('5') + expect(escapeColumn(';')).toEqual(';') + expect(escapeColumn('\t\n\r%;=&,')).toEqual('%09%0A%0D%25;=&,') + }) +}) + +describe('parseAttributes(attrString)', () => { + it('parses a simple attribute value', () => { + expect(parseAttributes('foo=bar')).toEqual({ foo: ['bar'] }) + }) + it('parses an attribute with escaped values', () => { + expect(parseAttributes('ID=Beep%2Cbonk%3B+Foo')).toEqual({ + ID: ['Beep,bonk;+Foo'], + }) + }) + it('ignores keys with no value', () => { + expect(parseAttributes('Target=Motif;rnd-family-27\n')).toEqual({ + Target: ['Motif'], + }) + }) + it('parses an empty attribute', () => { + expect(parseAttributes('.')).toEqual({}) + }) +}) + +describe('parseFeature(line)', () => { + it('parses a line with no missing values', () => { + expect( + parseFeature( + 'FooSeq\tbarsource\tmatch\t234\t234\t0\t+\t1\tID=Beep%2Cbonk%3B+Foo\n', + ), + ).toEqual({ + attributes: { ID: ['Beep,bonk;+Foo'] }, + end: 234, + phase: '1', + score: 0, + seq_id: 'FooSeq', + source: 'barsource', + start: 234, + strand: '+', + type: 'match', + }) + }) + it('parses 
a line with no values', () => { + expect(parseFeature('.\t.\t.\t.\t.\t.\t.\t.\t.\n')).toEqual({ + attributes: null, + end: null, + phase: null, + score: null, + seq_id: null, + source: null, + start: null, + strand: null, + type: null, + }) + }) + it('parses a line with escaped values (excluding column 9)', () => { + const seq_id = 'ctgAt25%' + const source = 'some\nsource' + expect( + parseFeature( + `${escapeColumn(seq_id)}\t${escapeColumn( + source, + )}\tmatch\t300\t400\t.\t.\t.\t.\n`, + ), + ).toEqual({ + attributes: null, + end: 400, + phase: null, + score: null, + seq_id, + source, + start: 300, + strand: null, + type: 'match', + }) + }) +}) + +describe('parseDirective(line)', () => { + it('parses a directive', () => { + expect(parseDirective('##gff-version 3\n')).toEqual({ + directive: 'gff-version', + value: '3', + }) + }) + it('parses a genome-build directive', () => { + expect(parseDirective('##genome-build WormBase ws110\n')).toEqual({ + directive: 'genome-build', + value: 'WormBase ws110', + source: 'WormBase', + buildName: 'ws110', + }) + }) + it('parses a sequence-region directive', () => { + expect(parseDirective('##sequence-region ctg123 1 1497228\n')).toEqual({ + directive: 'sequence-region', + value: 'ctg123 1 1497228', + seq_id: 'ctg123', + start: '1', + end: '1497228', + }) + }) + it('return null if the line is not a directive', () => { + expect( + parseDirective( + 'FooSeq\tbarsource\tmatch\t234\t234\t0\t+\t.\tID=Beep%2Cbonk%3B+Foo\n', + ), + ).toBeNull() + }) + it("doesn't fail on a bad directive", () => { expect(() => { parseDirective('##sequence-region no_start_end_on_sequence_region') }).not.toThrow() }) - it('can unescape properly', () => { - expect(unescape(' ')).toEqual(' ') - expect(unescape('5')).toEqual('5') - // TODO: should add more unescape tests - }) - const attributeCases: [string, Record][] = [ - ['foo=bar', { foo: ['bar'] }], - ['ID=Beep%2Cbonk%3B+Foo\n', { ID: ['Beep,bonk;+Foo'] }], - ['Target=Motif;rnd-family-27\n', { Target: ['Motif'] }], - ] - attributeCases.forEach(([input, output]) => { - it(`parses attr string ${input} correctly`, () => { - expect(parseAttributes(input)).toEqual(output) - }) +}) + +describe('formatAttributes(attrs)', () => { + it('formats a simple attribute value', () => { + expect(formatAttributes({ foo: ['bar'] })).toEqual('foo=bar') + }) + it('parses an attribute with escaped values and ignores keys with no value', () => { + expect(formatAttributes({ ID: ['Beep,bonk;+Foo'] })).toEqual( + 'ID=Beep%2Cbonk%3B+Foo', + ) }) + it('parses an empty attribute', () => { + expect(formatAttributes({})).toEqual('.') + }) +}) - const tests: [string, GFF3FeatureLine][] = [ - [ - `FooSeq\tbarsource\tmatch\t234\t234\t0\t+\t.\tID=Beep%2Cbonk%3B+Foo\n`, - { - attributes: { - ID: ['Beep,bonk;+Foo'], - }, +describe('formatFeature(featureOrFeatures)', () => { + it('formats a single-line feature', () => { + expect( + formatFeature({ + attributes: { ID: ['Beep,bonk;+Foo'] }, end: 234, - phase: null, + phase: '1', score: 0, seq_id: 'FooSeq', source: 'barsource', start: 234, strand: '+', type: 'match', - }, - ], - [ - `${escapeColumn( - 'Noggin,+-%Foo\tbar', - )}\tbarsource\tmatch\t234\t234\t0\t+\t.\t.\n`, - { - attributes: {}, - end: 234, - phase: null, - score: 0, - seq_id: 'Noggin,+-%Foo\tbar', - source: 'barsource', - start: 234, + }), + ).toEqual( + 'FooSeq\tbarsource\tmatch\t234\t234\t0\t+\t1\tID=Beep%2Cbonk%3B+Foo\n', + ) + }) + it('formats a feature with a child', () => { + expect( + formatFeature({ + seq_id: 'ctgA', + source: 'est', + type: 
'EST_match', + start: 1050, + end: 3202, + score: null, strand: '+', - type: 'match', - }, - ], - ] - tests.forEach(([input, output]) => { - it(`roundtrips feature line ${input} correctly`, () => { - expect(parseFeature(input)).toEqual(output) - expect(formatFeature(output)).toEqual(input) - expect(formatFeature(parseFeature(input))).toEqual(input) - expect(parseFeature(formatFeature(output))).toEqual(output) - }) + phase: null, + attributes: { ID: ['Match1'] }, + child_features: [ + [ + { + seq_id: 'ctgA', + source: 'est', + type: 'match_part', + start: 1050, + end: 1500, + score: null, + strand: '+', + phase: null, + attributes: { Parent: ['Match1'] }, + child_features: [], + derived_features: [], + }, + ], + ], + derived_features: [], + }), + ).toEqual( + `ctgA\test\tEST_match\t1050\t3202\t.\t+\t.\tID=Match1 +ctgA\test\tmatch_part\t1050\t1500\t.\t+\t.\tParent=Match1 +`, + ) + }) + it('formats a feature with multiple locations', () => { + expect( + formatFeature([ + { + seq_id: 'ctg123', + source: null, + type: 'CDS', + start: 1201, + end: 1500, + score: null, + strand: '+', + phase: '0', + attributes: { ID: ['cds00001'], Parent: ['mRNA00001'] }, + child_features: [], + derived_features: [], + }, + { + seq_id: 'ctg123', + source: null, + type: 'CDS', + start: 3000, + end: 3902, + score: null, + strand: '+', + phase: '0', + attributes: { ID: ['cds00001'], Parent: ['mRNA00001'] }, + child_features: [], + derived_features: [], + }, + ]), + ).toEqual( + `ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00001;Parent=mRNA00001 +ctg123\t.\tCDS\t3000\t3902\t.\t+\t0\tID=cds00001;Parent=mRNA00001 +`, + ) + }) + it('formats a feature with the same child in child_features and derived_features', () => { + expect( + formatFeature([ + { + seq_id: 'ctgA', + source: 'est', + type: 'parent', + start: 1050, + end: 3202, + score: null, + strand: '+', + phase: null, + attributes: { ID: ['Feature1'] }, + child_features: [ + [ + { + seq_id: 'ctgA', + source: 'est', + type: 'child', + start: 1050, + end: 1500, + score: null, + strand: '+', + phase: null, + attributes: { + Parent: ['Feature1'], + Derives_from: ['Feature1'], + }, + child_features: [], + derived_features: [], + }, + ], + ], + derived_features: [ + [ + { + seq_id: 'ctgA', + source: 'est', + type: 'child', + start: 1050, + end: 1500, + score: null, + strand: '+', + phase: null, + attributes: { + Parent: ['Feature1'], + Derives_from: ['Feature1'], + }, + child_features: [], + derived_features: [], + }, + ], + ], + }, + ]), + ).toEqual(`ctgA\test\tparent\t1050\t3202\t.\t+\t.\tID=Feature1 +ctgA\test\tchild\t1050\t1500\t.\t+\t.\tParent=Feature1;Derives_from=Feature1 +`) + }) +}) + +describe('formatDirective(directive)', () => { + it('formats a directive', () => { + expect(formatDirective({ directive: 'gff-version', value: '3' })).toEqual( + '##gff-version 3\n', + ) + }) + it('formats a genome-build directive', () => { + expect( + formatDirective({ + directive: 'genome-build', + value: 'WormBase ws110', + source: 'WormBase', + buildName: 'ws110', + }), + ).toEqual('##genome-build WormBase ws110\n') + }) + it('formats a sequence-region directive', () => { + expect( + formatDirective({ + directive: 'sequence-region', + value: 'ctg123 1 1497228', + seq_id: 'ctg123', + start: '1', + end: '1497228', + }), + ).toEqual('##sequence-region ctg123 1 1497228\n') + }) +}) + +describe('formatComment(comment)', () => { + it('formats a comment', () => { + expect(formatComment({ comment: 'my comment' })).toEqual('# my comment\n') }) }) -// is( gff3_format_attributes( undef ), 
'.', 'format undef attributes' ); -// is( gff3_format_attributes( {} ), '.', 'format empty attributes' ); -// is( gff3_format_attributes( { zee => 'zoz' } ), 'zee=zoz', 'format malformed attributes' ); -// is( gff3_format_attributes( { Alias => [], ID => ['Jim'] } ), 'ID=Jim', 'skip empty attributes' ); -// is( gff3_format_attributes( { Alias => [], ID => ['Jim'], Alias => ['Bob'], fogbat => ['noggin'], '01base' => ['free'], } ), 'ID=Jim;Alias=Bob;01base=free;fogbat=noggin', 'ID is forced to be first-printed attr' ); -// is( gff3_format_attributes( { ID => 'Bob', zee => undef } ), 'ID=Bob', 'also skip undef attributes 1' ); -// is( gff3_format_attributes( { ID => 'Bob', zee => [ undef ] } ), 'ID=Bob', 'also skip undef attributes 2' ); +describe('formatSequence(seq)', () => { + it('formats a short sequence', () => { + expect( + formatSequence({ + id: 'ctg123', + description: 'test contig', + sequence: 'ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA', + }), + ).toEqual( + `>ctg123 test contig +ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA +`, + ) + }) + it('formats a sequence with no description', () => { + expect( + formatSequence({ + id: 'ctg123', + sequence: 'ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA', + }), + ).toEqual( + `>ctg123 +ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA +`, + ) + }) + it('folds sequences longer than 80 characters', () => { + expect( + formatSequence({ + id: 'ctg123', + sequence: + 'ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCAACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA', + }), + ).toEqual( + `>ctg123 +ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCAACTGACTAGCTAGCATCAGCGTCGTAGCTATTA +TATTACGGTAGCCA +`, + ) + }) +}) -// is_deeply( -// gff3_parse_directive(" ## sequence-region contig12321_2.3 23,232 24,435,432"), -// { directive => 'sequence-region', -// seq_id => 'contig12321_2.3', -// start => 23_232, -// end => 24_435_432, -// value => 'contig12321_2.3 23,232 24,435,432', -// }, -// 'gff3_parse_directive seems to work', -// ); +describe('formatItem(itemOrItems)', () => { + it('formats a single item', () => { + expect(formatItem({ directive: 'gff-version', value: '3' })).toEqual( + '##gff-version 3\n', + ) + }) + it('formats an array of items', () => { + expect( + formatItem([ + { directive: 'gff-version', value: '3' }, + { comment: 'my comment' }, + { + attributes: { ID: ['Beep,bonk;+Foo'] }, + end: 234, + phase: '1', + score: 0, + seq_id: 'FooSeq', + source: 'barsource', + start: 234, + strand: '+', + type: 'match', + child_features: [], + derived_features: [], + }, + { + id: 'ctg123', + description: 'test contig', + sequence: 'ACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA', + }, + ]), + ).toEqual([ + '##gff-version 3\n', + '# my comment\n', + 'FooSeq\tbarsource\tmatch\t234\t234\t0\t+\t1\tID=Beep%2Cbonk%3B+Foo\n', + '>ctg123 test contig\nACTGACTAGCTAGCATCAGCGTCGTAGCTATTATATTACGGTAGCCA\n', + ]) + }) +}) diff --git a/src/util.ts b/src/util.ts index 26cc17c..e897b3e 100644 --- a/src/util.ts +++ b/src/util.ts @@ -80,7 +80,10 @@ export function parseAttributes(attrString: string): GFF3Attributes { */ export function parseFeature(line: string): GFF3FeatureLine { // split the line into columns and replace '.' with null in each column - const f = line.split('\t').map((a) => (a === '.' || a === '' ? null : a)) + const f = line + .trim() + .split('\t') + .map((a) => (a === '.' || a === '' ? 
null : a)) // unescape only the ref, source, and type columns const parsed: GFF3FeatureLine = { @@ -152,17 +155,7 @@ export function parseDirective( export function formatAttributes(attrs: GFF3Attributes): string { const attrOrder: string[] = [] Object.entries(attrs).forEach(([tag, val]) => { - if (!val) return - let valstring - if (val.hasOwnProperty('toString')) { - valstring = escape(val.toString()) - // } else if (Array.isArray(val.values)) { - // valstring = val.values.map(escape).join(',') - } else if (Array.isArray(val)) { - valstring = val.map(escape).join(',') - } else { - valstring = escape(val) - } + const valstring = val.map(escape).join(',') attrOrder.push(`${escape(tag)}=${valstring}`) }) return attrOrder.length ? attrOrder.join(';') : '.' @@ -324,7 +317,7 @@ export function formatItem( } /** A record of GFF3 attribute identifiers and the values of those identifiers */ -export type GFF3Attributes = Record +export type GFF3Attributes = Record /** A representation of a single line of a GFF3 file */ export interface GFF3FeatureLine { @@ -374,7 +367,7 @@ function _isFeatureLineWithRefs( export type GFF3Feature = GFF3FeatureLineWithRefs[] /** A GFF3 directive */ -export interface GFF3Directive { +export interface BaseGFF3Directive { /** The name of the directive */ directive: string /** The string value of the directive */ @@ -382,7 +375,7 @@ export interface GFF3Directive { } /** A GFF3 sequence-region directive */ -export interface GFF3SequenceRegionDirective extends GFF3Directive { +export interface GFF3SequenceRegionDirective extends BaseGFF3Directive { /** The string value of the directive */ value: string /** The sequence ID parsed from the directive */ @@ -394,7 +387,7 @@ export interface GFF3SequenceRegionDirective extends GFF3Directive { } /** A GFF3 genome-build directive */ -export interface GFF3GenomeBuildDirective extends GFF3Directive { +export interface GFF3GenomeBuildDirective extends BaseGFF3Directive { /** The string value of the directive */ value: string /** The genome build source parsed from the directive */ @@ -403,6 +396,11 @@ export interface GFF3GenomeBuildDirective extends GFF3Directive { buildName: string } +export type GFF3Directive = + | BaseGFF3Directive + | GFF3SequenceRegionDirective + | GFF3GenomeBuildDirective + /** A GFF3 comment */ export interface GFF3Comment { /** The text of the comment */ From 549bd9fa7fe6741f992cf93c49c48d42080d01ca Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 11 Apr 2023 15:33:47 -0600 Subject: [PATCH 16/17] Add a comment to one of the test GFF3s --- src/parse.test.ts | 2 +- test/data/hybrid1.gff3 | 1 + test/data/hybrid1.reformatted.gff3 | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/parse.test.ts b/src/parse.test.ts index 4582c7c..e8effed 100644 --- a/src/parse.test.ts +++ b/src/parse.test.ts @@ -69,7 +69,7 @@ describe('GFF3 parser', () => { [51, 'au9_scaffold_subset.gff3'], [14, 'tomato_chr4_head.gff3'], [5, 'directives.gff3'], - [5, 'hybrid1.gff3'], + [6, 'hybrid1.gff3'], [3, 'hybrid2.gff3'], [6, 'knownGene.gff3'], [6, 'knownGene2.gff3'], diff --git a/test/data/hybrid1.gff3 b/test/data/hybrid1.gff3 index 7995cde..9927e9d 100644 --- a/test/data/hybrid1.gff3 +++ b/test/data/hybrid1.gff3 @@ -1,4 +1,5 @@ ##gff-version 3 +# This is a comment chr17 UCSC mRNA 62467934 62469545 . - . 
ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1 chr17 UCSC CDS 62468039 62468236 . - 1 Parent=A00469 chr17 UCSC CDS 62468490 62468654 . - 2 Parent=A00469 diff --git a/test/data/hybrid1.reformatted.gff3 b/test/data/hybrid1.reformatted.gff3 index a85d6c2..230e648 100644 --- a/test/data/hybrid1.reformatted.gff3 +++ b/test/data/hybrid1.reformatted.gff3 @@ -1,4 +1,5 @@ ##gff-version 3 +# This is a comment chr17 UCSC mRNA 62467934 62469545 . - . ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;Note=growth hormone 1;Alias=GH1 chr17 UCSC CDS 62468039 62468236 . - 1 Parent=A00469 chr17 UCSC CDS 62468490 62468654 . - 2 Parent=A00469 From 226a50a14429591816b6c95a742066a6ea3fc0ae Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Tue, 11 Apr 2023 15:34:38 -0600 Subject: [PATCH 17/17] Standardize imports --- src/api.ts | 11 ++++++----- src/parse.ts | 47 +++++++++++++++++++++++++++-------------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/api.ts b/src/api.ts index 80a3df4..d81dfa7 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,4 +1,4 @@ -import Parser, { ParseCallbacks } from './parse' +import { GFF3Parser, ParseCallbacks } from './parse' import { formatItem, formatSequence, @@ -48,7 +48,7 @@ function _processParseOptions(options: ParseOptions): ParseOptionsProcessed { */ export class GFFTransformer implements Transformer { private decoder: TextDecoder - private parser: Parser + private parser: GFF3Parser private lastString = '' private parseFeatures: boolean private parseDirectives: boolean @@ -63,7 +63,7 @@ export class GFFTransformer implements Transformer { this.decoder = new TextDecoder() const processedOptions = _processParseOptions(options) const { bufferSize, disableDerivesFromReferences } = processedOptions - this.parser = new Parser({ bufferSize, disableDerivesFromReferences }) + this.parser = new GFF3Parser({ bufferSize, disableDerivesFromReferences }) this.parseFeatures = processedOptions.parseFeatures this.parseDirectives = processedOptions.parseDirectives this.parseComments = processedOptions.parseComments @@ -462,7 +462,7 @@ export function parseStringSync( callbacks.sequenceCallback = push } - const parser = new Parser({ + const parser = new GFF3Parser({ disableDerivesFromReferences: options.disableDerivesFromReferences || false, bufferSize: Infinity, }) @@ -532,7 +532,8 @@ export class GFFFormattingTransformer implements Transformer { */ constructor(options: FormatOptions = {}) { this.minLinesBetweenSyncMarks = options.minSyncLines || 100 - this.insertVersionDirective = options.insertVersionDirective || true + this.insertVersionDirective = + options.insertVersionDirective === false ? 
false : true } transform( diff --git a/src/parse.ts b/src/parse.ts index 0c790fc..0f0cc86 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -1,4 +1,12 @@ -import * as GFF3 from './util' +import { + parseDirective, + parseFeature, + GFF3Feature, + GFF3FeatureLineWithRefs, + GFF3Comment, + GFF3Directive, + GFF3Sequence, +} from './util' const containerAttributes = { Parent: 'child_features' as const, @@ -6,11 +14,11 @@ const containerAttributes = { } export interface ParseCallbacks { - featureCallback?(feature: GFF3.GFF3Feature): void - commentCallback?(comment: GFF3.GFF3Comment): void + featureCallback?(feature: GFF3Feature): void + commentCallback?(comment: GFF3Comment): void errorCallback?(error: string): void - directiveCallback?(directive: GFF3.GFF3Directive): void - sequenceCallback?(sequence: GFF3.GFF3Sequence): void + directiveCallback?(directive: GFF3Directive): void + sequenceCallback?(sequence: GFF3Sequence): void } export class FASTAParser { @@ -20,7 +28,7 @@ export class FASTAParser { constructor( // eslint-disable-next-line @typescript-eslint/no-empty-function - private seqCallback: (sequence: GFF3.GFF3Sequence) => void = () => {}, + private seqCallback: (sequence: GFF3Sequence) => void = () => {}, ) { this.currentSequence = undefined } @@ -52,11 +60,11 @@ interface ParserArgs { } interface References { - Parent: GFF3.GFF3Feature[] - Derives_from: GFF3.GFF3Feature[] + Parent: GFF3Feature[] + Derives_from: GFF3Feature[] } -export default class Parser { +export class GFF3Parser { endCallback: () => void disableDerivesFromReferences: boolean bufferSize: number @@ -67,10 +75,9 @@ export default class Parser { lineNumber = 0 // features that we have to keep on hand for now because they might be // referenced by something else - private underConstructionTopLevel: GFF3.GFF3Feature[] = [] + private underConstructionTopLevel: GFF3Feature[] = [] // index of the above by ID - private underConstructionById: Record = - {} + private underConstructionById: Record = {} private completedReferences: Record< string, Record | undefined @@ -125,7 +132,7 @@ export default class Parser { // sync directive, all forward-references are resolved. 
this.emitAllUnderConstructionFeatures(callbacks) } else if (hashsigns.length === 2) { - const directive = GFF3.parseDirective(line) + const directive = parseDirective(line) if (directive) { if (directive.directive === 'FASTA') { this.emitAllUnderConstructionFeatures(callbacks) @@ -161,7 +168,7 @@ export default class Parser { } private emitItem( - i: GFF3.GFF3Feature | GFF3.GFF3Directive | GFF3.GFF3Comment, + i: GFF3Feature | GFF3Directive | GFF3Comment, callbacks: ParseCallbacks, ) { if (Array.isArray(i) && callbacks.featureCallback) @@ -176,7 +183,7 @@ export default class Parser { additionalItemCount = 0, callbacks: ParseCallbacks, ) { - const _unbufferItem = (item?: GFF3.GFF3Feature) => { + const _unbufferItem = (item?: GFF3Feature) => { if ( item && Array.isArray(item) && @@ -236,8 +243,8 @@ export default class Parser { // do the right thing with a newly-parsed feature line private bufferLine(line: string, callbacks: ParseCallbacks) { - const rawFeatureLine = GFF3.parseFeature(line) - const featureLine: GFF3.GFF3FeatureLineWithRefs = { + const rawFeatureLine = parseFeature(line) + const featureLine: GFF3FeatureLineWithRefs = { ...rawFeatureLine, child_features: [], derived_features: [], @@ -257,7 +264,7 @@ export default class Parser { return } - let feature: GFF3.GFF3Feature | undefined = undefined + let feature: GFF3Feature | undefined = undefined ids.forEach((id) => { const existing = this.underConstructionById[id] if (existing) { @@ -296,7 +303,7 @@ export default class Parser { ) } - private resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) { + private resolveReferencesTo(feature: GFF3Feature, id: string) { const references = this.underConstructionOrphans.get(id) // references is of the form // { @@ -335,7 +342,7 @@ export default class Parser { } private resolveReferencesFrom( - feature: GFF3.GFF3Feature, + feature: GFF3Feature, references: { Parent: string[]; Derives_from: string[] }, ids: string[], ) {