From d6168a7f395edcc6d7f43bd94b6329a317a77823 Mon Sep 17 00:00:00 2001 From: Steffengreiner Date: Mon, 12 Dec 2022 13:44:04 +0100 Subject: [PATCH] Introduce new Nanopore schema to account for files generated due to second basecalling (#344) * WIP New nanopore schema accounting for second basecalling folder * Finalize nanopore schema accounting for second basecalling folder * Add minItems to ensure that all files are provided * Provide datastructure model in readme for basecalling folder * apply review suggestion Co-authored-by: wow-such-code * Update v3 Readme Co-authored-by: wow-such-code * Update v3 Readme Co-authored-by: wow-such-code * Provide datastructure model in readme for basecalling folder Co-authored-by: wow-such-code --- README.md | 4 + .../Nanopore_Data_Structure_Model_v4.svg | 4 + pom.xml | 2 +- .../datasets/OxfordNanoporeExperiment.groovy | 7 +- .../files/nanopore/GuppyBasecallLog.groovy | 30 + .../nanopore/SequencingTelemetryLog.groovy | 32 + .../folders/nanopore/BasecallingFolder.groovy | 42 ++ .../OxfordNanoporeInstrumentOutputV4.groovy | 22 + .../nanopore-instrument-output.schema.json | 2 +- .../nanopore-instrument-output_v2.schema.json | 2 +- .../nanopore-instrument-output_v3.schema.json | 2 +- .../nanopore-instrument-output_v4.schema.json | 635 ++++++++++++++++++ .../OxfordNanoporeExperimentSpec.groovy | 23 + .../nanopore/GuppyBasecallLogSpec.groovy | 37 + .../SequencingTelemetryLogSpec.groovy | 37 + .../nanopore/BasecallingFolderSpec.groovy | 43 ++ .../valid-example-v4-with-basecalling.json | 267 ++++++++ 17 files changed, 1185 insertions(+), 6 deletions(-) create mode 100644 doc/figures/Nanopore_Data_Structure_Model_v4.svg create mode 100644 src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLog.groovy create mode 100644 src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLog.groovy create mode 100644 
src/main/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolder.groovy create mode 100644 src/main/groovy/life/qbic/datamodel/instruments/OxfordNanoporeInstrumentOutputV4.groovy create mode 100644 src/main/resources/schemas/nanopore-instrument-output_v4.schema.json create mode 100644 src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLogSpec.groovy create mode 100644 src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLogSpec.groovy create mode 100644 src/test/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolderSpec.groovy create mode 100644 src/test/resources/nanopore/valid-example-v4-with-basecalling.json diff --git a/README.md b/README.md index d97d27b359..752a60c58c 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,10 @@ A more recent model, which places two of the configuration files into a subfolde ![Nanopore Data Structure Model v2](./doc/figures/Nanopore_Data_Structure_Model_v2.svg) +V4 outlines a model in which a second higher-accuracy basecalling was performed after the initial basecalling + +![Nanopore Data Structure Model v4](./doc/figures/Nanopore_Data_Structure_Model_v4.svg) + #### Nanopore usage example For usage examples, see the [usage documentation](./doc/examples.md). diff --git a/doc/figures/Nanopore_Data_Structure_Model_v4.svg b/doc/figures/Nanopore_Data_Structure_Model_v4.svg new file mode 100644 index 0000000000..4c1602816e --- /dev/null +++ b/doc/figures/Nanopore_Data_Structure_Model_v4.svg @@ -0,0 +1,4 @@ + + + +
Root Folder
(OxfordNanoporeExperiment)
Root Folder...
Measurement Folder
(OxfordNanoporeMeasurement)
Measurement Folder...
1
1
1..n
1..n
FastQ Fail Folder
FastQ Fail Folder
FastQ Pass Folder
FastQ Pass Folder
Fast5 Pass Folder
Fast5 Pass Folder
Fast5 Fail Folder
Fast5 Fail Folder
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
Sequencing Summary Log
Sequencing Summary Log
Duty Time Log
Duty Time Log
Final Summary Log
Final Summary Log
Throughput Log
Throughput Log
Report MD Log
Report MD Log
Report PDF Log
Report PDF Log
Drift Correction Log
Drift Correction Log
Mux Scan Data Log
Mux Scan Data Log
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
FastQ Folder
FastQ Folder
FastQ File
FastQ File
0..n
0..n
DataFile
DataFile
BarcodedFolder
BarcodedFolder
Extends
Extends
Extends
Extends
Data File
Data File
Extends
Extends
DataFolder
DataFolder
Extends
Extends
DataFolder
DataFolder
Extends
Extends
Unclassified Folder
Unclassified Folder
1
1
0..n
0..n
0..n
0..n
FastQ File
FastQ File
1
1
0..n
0..n
1
1
0..n
0..n
FastQ Folder
FastQ Folder
FastQ File
FastQ File
Unclassified Folder
Unclassified Folder
1
1
0..n
0..n
FastQ File
FastQ File
1
1
0..n
0..n
Fast5 Folder
Fast5 Folder
Fast5 File
Fast5 File
Unclassified Folder
Unclassified Folder
1
1
0..n
0..n
Fast5 File
Fast5 File
1
1
0..n
0..n
Fast5 Folder
Fast5 Folder
Fast5 File
Fast5 File
Unclassified Folder
Unclassified Folder
1
1
0..n
0..n
Fast5 File
Fast5 File
1
1
0..n
0..n
0..n
0..n
0..n
0..n
1
1
0..n
0..n
1
1
0..n
0..n
0..n
0..n
1
1
0..n
0..n
0..n
0..n
0..n
0..n
FastQ Folder
FastQ Folder
Fast5 Folder
Fast5 Folder
DataFolder
DataFolder
Sequencing Summary Log
Sequencing Summary Log
Sequencing Telemetry Log
Sequencing Telemetry Log
Guppy Basecalling
Client Log
Guppy Basecalling...
Fastq Fail Folder
Fastq Fail Folder
1
1
1
1
1
1
1
1
1
1
Data File
Data File
Extends
Extends
FastQ Folder
FastQ Folder
Unclassified Folder
Unclassified Folder
FastQ File
FastQ File
FastQ File
FastQ File
1
1
0..n
0..n
1
1
0..n
0..n
0..n
0..n
0..n
0..n
1
1
0..n
0..n
Fastq Pass Folder
Fastq Pass Folder
FastQ Folder
FastQ Folder
Unclassified Folder
Unclassified Folder
FastQ File
FastQ File
FastQ File
FastQ File
1
1
0..n
0..n
1
1
0..n
0..n
1
1
0..n
0..n
0..n
0..n
1
1
0..n
0..n
Basecalling
Basecalling
DataFolder
DataFolder
Extends
Extends
1
1
Text is not SVG - cannot display
\ No newline at end of file diff --git a/pom.xml b/pom.xml index 54d10a8cee..9996e7e362 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ life.qbic data-model-lib - 2.23.0-SNAPSHOT + 2.24.0-SNAPSHOT data-model-lib http://github.com/qbicsoftware/data-model-lib Data models. A collection of QBiC's central data models and DTOs. diff --git a/src/main/groovy/life/qbic/datamodel/datasets/OxfordNanoporeExperiment.groovy b/src/main/groovy/life/qbic/datamodel/datasets/OxfordNanoporeExperiment.groovy index e4f0146f93..057834dc29 100644 --- a/src/main/groovy/life/qbic/datamodel/datasets/OxfordNanoporeExperiment.groovy +++ b/src/main/groovy/life/qbic/datamodel/datasets/OxfordNanoporeExperiment.groovy @@ -40,7 +40,9 @@ final class OxfordNanoporeExperiment implements ExperimentFolder { FQDN_FILES + ".BarcodeAlignmentLog", FQDN_FILES + ".PoreActivityLog", FQDN_FILES + ".SampleSheetLog", - FQDN_FILES + ".PoreScanDataLog" + FQDN_FILES + ".PoreScanDataLog", + FQDN_FILES + ".SequencingTelemetryLog", + FQDN_FILES + ".GuppyBasecallLog" ] private final static Set nanoporeFolderTypes = [ @@ -52,7 +54,8 @@ final class OxfordNanoporeExperiment implements ExperimentFolder { FQDN_FOLDERS + ".FastQFailFolder", FQDN_FOLDERS + ".UnclassifiedFast5Folder", FQDN_FOLDERS + ".UnclassifiedFastQFolder", - FQDN_FOLDERS + ".OtherReportsFolder" + FQDN_FOLDERS + ".OtherReportsFolder", + FQDN_FOLDERS + ".BasecallingFolder" ] private OxfordNanoporeExperiment(String sampleId, List measurements) { diff --git a/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLog.groovy b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLog.groovy new file mode 100644 index 0000000000..c749b0564c --- /dev/null +++ b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLog.groovy @@ -0,0 +1,30 @@ +package life.qbic.datamodel.datasets.datastructure.files.nanopore + +import 
life.qbic.datamodel.datasets.datastructure.files.DataFile + +/** + * A specialisation of a DataFile, represents an Oxford Nanopore guppy basecalling client log file + */ +class GuppyBasecallLog extends DataFile { + + final private static String FILE_TYPE = "log" + + final private static String NAME_SCHEMA = $/guppy_basecall_client_log-.*/$ + + protected GuppyBasecallLog() {} + + protected GuppyBasecallLog(String name, String relativePath) { + super(name, relativePath, FILE_TYPE) + validateName() + } + + static GuppyBasecallLog create(String name, String relativePath) { + return new GuppyBasecallLog(name, relativePath) + } + + private void validateName() { + if (!(this.name =~ NAME_SCHEMA)) { + throw new IllegalArgumentException("Name must match the Nanopore guppy basecall client log schema!") + } + } +} diff --git a/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLog.groovy b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLog.groovy new file mode 100644 index 0000000000..2ae6d07cf5 --- /dev/null +++ b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLog.groovy @@ -0,0 +1,32 @@ +package life.qbic.datamodel.datasets.datastructure.files.nanopore + +import life.qbic.datamodel.datasets.datastructure.files.DataFile + +/** + * A specialisation of a DataFile, represents an Oxford Nanopore sequencing telemetry log file + * + */ +class SequencingTelemetryLog extends DataFile { + + final private static String FILE_TYPE = "js" + + final private static String NAME_SCHEMA = $/sequencing_telemetry_.*/$ + + protected SequencingTelemetryLog() {} + + protected SequencingTelemetryLog(String name, String relativePath) { + super(name, relativePath, FILE_TYPE) + validateName() + } + + static SequencingTelemetryLog create(String name, String relativePath) { + return new SequencingTelemetryLog(name, relativePath) + } + + private void validateName() { + if 
(!(this.name =~ NAME_SCHEMA)) { + throw new IllegalArgumentException("Name must match the Nanopore sequencing telemetry log name schema!") + } + } + +} diff --git a/src/main/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolder.groovy b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolder.groovy new file mode 100644 index 0000000000..eba6e31afa --- /dev/null +++ b/src/main/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolder.groovy @@ -0,0 +1,42 @@ +package life.qbic.datamodel.datasets.datastructure.folders.nanopore + +import life.qbic.datamodel.datasets.datastructure.folders.DataFolder + +/** + * A specialisation of a DataFolder, represents an Oxford Nanopore basecalling folder + * + * The folder holds the files generated by a second, higher-accuracy basecalling + * + * @since 2.24.0 + * + */ +class BasecallingFolder extends DataFolder { + /** + * The name schema of a basecalling folder contained within the nanopore dataset. + */ + final private static String NAME_SCHEMA = /basecalling/ + + protected BasecallingFolder() {} + + protected BasecallingFolder(String name, String relativePath, List children) { + super(name, relativePath, children) + validateName() + } + + /** + * Creates a new instance of a BasecallingFolder object + * @param name The name of the folder, must match the basecalling folder name schema + * @param relativePath The relative path of the folder + * @param children A list with child elements of unknown type of the folder + * @return A new instance of a BasecallingFolder object + */ + static BasecallingFolder create(String name, String relativePath, List children) { + new BasecallingFolder(name, relativePath, children) + } + + private void validateName() { + if (!(this.name =~ NAME_SCHEMA)) { + throw new IllegalArgumentException("Name must match the Nanopore Basecalling schema!") + } + } +} diff --git a/src/main/groovy/life/qbic/datamodel/instruments/OxfordNanoporeInstrumentOutputV4.groovy b/src/main/groovy/life/qbic/datamodel/instruments/OxfordNanoporeInstrumentOutputV4.groovy new file mode 100644 index 0000000000..14f4d5050e --- /dev/null +++ 
b/src/main/groovy/life/qbic/datamodel/instruments/OxfordNanoporeInstrumentOutputV4.groovy @@ -0,0 +1,22 @@ +package life.qbic.datamodel.instruments + + +/** + * Represents the Nanopore instrument output data structure schema (version 4). + * + * The original schema is defined as a resource and is + * referenced here, wrapped in a Groovy class for reference + * in applications that want to validate the instrument + * output structure against the schema. + * + * @author Steffen Greiner + * @since 2.24.0 + */ +class OxfordNanoporeInstrumentOutputV4 { + + private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_v4.schema.json" + + static InputStream getSchemaAsStream() { + return OxfordNanoporeInstrumentOutputV4.getResourceAsStream(SCHEMA_PATH) + } +} diff --git a/src/main/resources/schemas/nanopore-instrument-output.schema.json b/src/main/resources/schemas/nanopore-instrument-output.schema.json index 47a6be3be5..3071d093a0 100644 --- a/src/main/resources/schemas/nanopore-instrument-output.schema.json +++ b/src/main/resources/schemas/nanopore-instrument-output.schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "http://qbic.life/nanopore-instrument-output.schema.json", "title": "Nanopore Instrument Output", - "description": "Describes in which form Nanopore data is received from the lab.", + "description": "Describes in which form PromethION/MinION sequenced Nanopore data is received from the microbiology lab.", "definitions": { "folder": { "description": "Describes a folder", diff --git a/src/main/resources/schemas/nanopore-instrument-output_v2.schema.json b/src/main/resources/schemas/nanopore-instrument-output_v2.schema.json index 452feeefea..39256876c7 100644 --- a/src/main/resources/schemas/nanopore-instrument-output_v2.schema.json +++ b/src/main/resources/schemas/nanopore-instrument-output_v2.schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": 
"http://qbic.life/nanopore-instrument-output_v2.schema.json", "title": "Nanopore Instrument Output V2", - "description": "Describes in which form Nanopore data is received from the lab.", + "description": "Describes in which form PromethION/MinION sequenced Nanopore data is received from the medical genetics lab. Accounts for the 'other reports' folder created by the lab", "definitions": { "folder": { "description": "Describes a folder", diff --git a/src/main/resources/schemas/nanopore-instrument-output_v3.schema.json b/src/main/resources/schemas/nanopore-instrument-output_v3.schema.json index 456bb037dd..3c656cc524 100644 --- a/src/main/resources/schemas/nanopore-instrument-output_v3.schema.json +++ b/src/main/resources/schemas/nanopore-instrument-output_v3.schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "http://qbic.life/nanopore-instrument-output_v3.schema.json", "title": "Nanopore Instrument Output V3", - "description": "Describes in which form Nanopore data is received from the lab.", + "description": "Describes in which form PromethION/MinION sequenced Nanopore data is received from the medical genetics lab. Accounts for the adapted 'other_reports' folder structure provided by the lab", "definitions": { "folder": { "description": "Describes a folder", diff --git a/src/main/resources/schemas/nanopore-instrument-output_v4.schema.json b/src/main/resources/schemas/nanopore-instrument-output_v4.schema.json new file mode 100644 index 0000000000..0c22e107a9 --- /dev/null +++ b/src/main/resources/schemas/nanopore-instrument-output_v4.schema.json @@ -0,0 +1,635 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "http://qbic.life/nanopore-instrument-output_v4.schema.json", + "title": "Nanopore Instrument Output V4", + "description": "Describes in which form PromethION/MinION sequenced Nanopore data is received from the microbiology lab. 
For this dataset a second basecalling with higher accuracy was performed after an initial fast basecalling during sequencing", + "definitions": { + "folder": { + "description": "Describes a folder", + "type": "object", + "required": [ + "name", + "path", + "children" + ], + "properties": { + "name": { + "description": "Folder name", + "type": "string", + "minLength": 1 + }, + "path": { + "description": "relative folderpath", + "type": "string", + "minLength": 1 + }, + "children": { + "description": "Describes files and/or sub-folders if existent", + "type": "array", + "items": { + "oneOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "$ref": "#/definitions/file" + } + ] + } + } + } + }, + "file": { + "description": "Describes a file", + "type": "object", + "required": [ + "name", + "path", + "file_type" + ], + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "path": { + "type": "string", + "minLength": 1 + }, + "file_type": { + "type": "string", + "minLength": 1 + } + } + }, + "qbic_code": { + "description": "Describes a QBiC code used as a prefix", + "type": "string", + "pattern": "Q\\w{4}\\d{3}[A-X][A-X0-9].*" + }, + "barcoded_folder": { + "description": "folder starting with qbic barcode prefix", + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "$ref": "#/definitions/qbic_code" + } + } + } + ] + }, + "fast5_file": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "file_type": { + "pattern": "fast5" + } + } + } + ] + }, + "fastqgz_file": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "file_type": { + "pattern": "fastq.gz" + } + } + } + ] + }, + "fastq_file": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "file_type": { + "pattern": "fastq" + } + } + } + ] + }, + "unclassified_folder": { + "description": "folder containing unassigned read file(s)", + "allOf": [ + { + "$ref": "#/definitions/folder" + 
}, + { + "properties": { + "name": { + "pattern": "unclassified" + } + } + } + ] + }, + "fast5_unclassified_folder": { + "description": "folder containing fast5 data from a pooling experiment, that could not be assigned to one of the known samples", + "allOf": [ + { + "$ref": "#/definitions/unclassified_folder" + }, + { + "properties": { + "children": { + "items": { + "$ref": "#/definitions/fast5_file" + }, + "minItems": 0 + } + } + } + ] + }, + "fastq_unclassified_folder": { + "description": "folder containing fastq and/or fastq.gz data from a pooling experiment, that could not be assigned to one of the known samples", + "allOf": [ + { + "$ref": "#/definitions/unclassified_folder" + }, + { + "properties": { + "children": { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fastqgz_file" + }, + { + "$ref": "#/definitions/fastq_file" + } + ] + }, + "minItems": 0 + } + } + } + ] + }, + "fast5_subfolder": { + "description": "folder containing fast5 data from a single sample (only when pooling is used)", + "allOf": [ + { + "$ref": "#/definitions/barcoded_folder" + }, + { + "properties": { + "children": { + "items": { + "$ref": "#/definitions/fast5_file" + }, + "minItems": 1 + } + } + } + ] + }, + "fast5_fail": { + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "fast5_fail" + }, + "children": { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fast5_subfolder" + }, + { + "$ref": "#/definitions/fast5_unclassified_folder" + }, + { + "$ref": "#/definitions/fast5_file" + } + ] + } + } + } + } + ] + }, + "fast5_pass": { + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "fast5_pass" + }, + "children": { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fast5_subfolder" + }, + { + "$ref": "#/definitions/fast5_unclassified_folder" + }, + { + "$ref": "#/definitions/fast5_file" + } + ] + } + } + } + } + ] + }, + "fastq_fail": { + "allOf": [ + { + "$ref": 
"#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "fastq_fail" + }, + "children": { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fastq_subfolder" + }, + { + "$ref": "#/definitions/fastq_unclassified_folder" + }, + { + "$ref": "#/definitions/fastqgz_file" + } + ] + } + } + } + } + ] + }, + "fastq_pass": { + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "fastq_pass" + }, + "children": { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fastq_subfolder" + }, + { + "$ref": "#/definitions/fastq_unclassified_folder" + }, + { + "$ref": "#/definitions/fastqgz_file" + } + ] + } + } + } + } + ] + }, + "fastq_subfolder": { + "description": "folder containing gzipped fastq data from a single sample (only when pooling is used)", + "allOf": [ + { + "$ref": "#/definitions/barcoded_folder" + }, + { + "properties": { + "children": { + "items": { + "$ref": "#/definitions/fastqgz_file" + }, + "minItems": 1 + } + } + } + ] + }, + "basecalling_folder": { + "description": "folder containing the files resulting from a second high accuracy basecalling performed after the initial sequencing", + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "basecalling" + }, + "children": { + "uniqueItems": true, + "minItems": 5, + "items": { + "anyOf": [ + { + "$ref": "#/definitions/fastq_pass" + }, + { + "$ref": "#/definitions/fastq_fail" + }, + { + "$ref": "#/definitions/sequencing_summary_log" + }, + { + "$ref": "#/definitions/sequencing_telemetry" + }, + { + "$ref": "#/definitions/guppy_basecall_client_log" + } + ] + } + } + } + } + ] + }, + "measurements": { + "description": "Top folder generated by the facility, containing one or more timestamped measurements", + "allOf": [ + { + "$ref": "#/definitions/barcoded_folder" + }, + { + "properties": { + "children": { + "items": { + "allOf": [ + { + "$ref": "#/definitions/measurement" + } + ] + }, + "minItems": 
1 + } + } + } + ] + }, + "measurement": { + "allOf": [ + { + "$ref": "#/definitions/folder" + }, + { + "properties": { + "name": { + "pattern": "\\d{4}(0?[1-9]|1[012])(0?[1-9]|[12][0-9]|3[01])_([01][0-9]|2[0-3])([0-5][0-9]).*", + "description": "Name of measurement subfolder. Starts with date and time of measurement." + }, + "children": { + "uniqueItems": true, + "minItems": 11, + "items": { + "oneOf": [ + { + "$ref": "#/definitions/fastq_fail" + }, + { + "$ref": "#/definitions/fastq_pass" + }, + { + "$ref": "#/definitions/fast5_pass" + }, + { + "$ref": "#/definitions/fast5_fail" + }, + { + "$ref": "#/definitions/drift_correction_log" + }, + { + "$ref": "#/definitions/duty_time_log" + }, + { + "$ref": "#/definitions/final_summary_log" + }, + { + "$ref": "#/definitions/mux_scan_data_log" + }, + { + "$ref": "#/definitions/report_md_log" + }, + { + "$ref": "#/definitions/report_html_log" + }, + { + "$ref": "#/definitions/sequencing_summary_log" + }, + { + "$ref": "#/definitions/throughput_log" + }, + { + "$ref": "#/definitions/basecalling_folder" + } + ] + } + } + } + } + ] + }, + "drift_correction_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "drift_correction_.*" + }, + "file_type": { + "pattern": "csv" + } + } + } + ] + }, + "duty_time_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "duty_time_.*" + }, + "file_type": { + "pattern": "csv" + } + } + } + ] + }, + "final_summary_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "final_summary_.*" + }, + "file_type": { + "pattern": "txt" + } + } + } + ] + }, + "mux_scan_data_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "mux_scan_data_.*" + }, + "file_type": { + "pattern": "csv" + } + } + } + ] + }, + "report_md_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + 
"properties": { + "name": { + "pattern": "report_.*" + }, + "file_type": { + "pattern": "md" + } + } + } + ] + }, + "report_html_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "report_.*" + }, + "file_type": { + "pattern": "html" + } + } + } + ] + }, + "sequencing_summary_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "sequencing_summary_.*" + }, + "file_type": { + "pattern": "txt" + } + } + } + ] + }, + "throughput_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "throughput_.*" + }, + "file_type": { + "pattern": "csv" + } + } + } + ] + }, + "sequencing_telemetry": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "sequencing_telemetry_.*" + }, + "file_type": { + "pattern": "js" + } + } + } + ] + }, + "guppy_basecall_client_log": { + "allOf": [ + { + "$ref": "#/definitions/file" + }, + { + "properties": { + "name": { + "pattern": "guppy_basecall_client_log-.*" + }, + "file_type": { + "pattern": "log" + } + } + } + ] + } + }, + "allOf": [ + { + "$ref": "#/definitions/measurements" + } + ] +} diff --git a/src/test/groovy/life/qbic/datamodel/datasets/datastructure/OxfordNanoporeExperimentSpec.groovy b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/OxfordNanoporeExperimentSpec.groovy index ebb263fc11..391aa0b0c2 100644 --- a/src/test/groovy/life/qbic/datamodel/datasets/datastructure/OxfordNanoporeExperimentSpec.groovy +++ b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/OxfordNanoporeExperimentSpec.groovy @@ -35,6 +35,12 @@ class OxfordNanoporeExperimentSpec extends Specification { */ @Shared Map extendedDataStructureWithReportsFolderV3 + + /** + * Addition to the newest structure, containing a second basecalling run + */ + @Shared + Map extendedDataStructureWithReportsFolderV4 /** * Map that that stores the Oxford 
Nanopore folder structure * according to the schema containing unclassified read information @@ -61,6 +67,9 @@ class OxfordNanoporeExperimentSpec extends Specification { // latest example with slightly different structure stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(folder+"valid-example-v3.json") extendedDataStructureWithReportsFolderV3 = (Map) new JsonSlurper().parse(stream) + // nanopore structure containing a second basecalling folder + stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(folder+"valid-example-v4-with-basecalling.json") + extendedDataStructureWithReportsFolderV4 = (Map) new JsonSlurper().parse(stream) // read in unclassified example stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(folder+"valid-example-unclassified.json") unclassifiedWorkingDataStructure = (Map) new JsonSlurper().parse(stream) @@ -126,6 +135,20 @@ class OxfordNanoporeExperimentSpec extends Specification { assert measurements[0].asicTemp == "32.631687" } + def "Create sample Oxford Nanopore experiment successfully for structure with second basecalling"() { + given: + final def example = extendedDataStructureWithReportsFolderV4 + + when: + final def experiment = OxfordNanoporeExperiment.create(example) + final def measurements = experiment.getMeasurements() + + then: + assert experiment.sampleCode == "QABCD001AB" + assert measurements.size() == 1 + assert measurements[0].asicTemp == "32.631687" + } + def "Create a simple pooled Oxford Nanopore experiment successfully"() { given: final def example = minimalWorkingPooledDataStructure diff --git a/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLogSpec.groovy b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLogSpec.groovy new file mode 100644 index 0000000000..27b5722952 --- /dev/null +++ 
b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/GuppyBasecallLogSpec.groovy @@ -0,0 +1,37 @@ +package life.qbic.datamodel.datasets.datastructure.files.nanopore + +import spock.lang.Specification + +/** + * + * + */ +class GuppyBasecallLogSpec extends Specification { + + def "shall create a GuppyBasecallingLog instance"() { + given: + final name = "guppy_basecall_client_log-.log" + final relativePath = "root/basecalling/guppy_basecall_client_log-.log" + + when: + def dataObject = GuppyBasecallLog.create(name, relativePath) + + then: + assert dataObject instanceof GuppyBasecallLog + assert dataObject.relativePath == relativePath + assert dataObject.name == name + } + + def "name not matching schema shall throw IllegalArgumentException"() { + given: + final name = "guppy_basecall.log" + final relativePath = "root/basecalling/guppy_basecall.log" + + when: + def dataObject = GuppyBasecallLog.create(name, relativePath) + + then: + thrown(IllegalArgumentException) + } + +} diff --git a/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLogSpec.groovy b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLogSpec.groovy new file mode 100644 index 0000000000..d80b92861c --- /dev/null +++ b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/files/nanopore/SequencingTelemetryLogSpec.groovy @@ -0,0 +1,37 @@ +package life.qbic.datamodel.datasets.datastructure.files.nanopore + +import spock.lang.Specification + +/** + * + * + */ +class SequencingTelemetryLogSpec extends Specification { + + def "shall create a SequencingTelemetryLog instance"() { + given: + final name = "sequencing_telemetry_.js" + final relativePath = "root/basecalling/sequencing_telemetry_.js" + + when: + def dataObject = SequencingTelemetryLog.create(name, relativePath) + + then: + assert dataObject instanceof SequencingTelemetryLog + assert dataObject.relativePath == relativePath + assert 
dataObject.name == name + } + + def "name not matching schema shall throw IllegalArgumentException"() { + given: + final name = "telemetry.log" + final relativePath = "root/basecalling/telemetry.log" + + when: + def dataObject = SequencingTelemetryLog.create(name, relativePath) + + then: + thrown(IllegalArgumentException) + } + +} diff --git a/src/test/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolderSpec.groovy b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolderSpec.groovy new file mode 100644 index 0000000000..074c073b28 --- /dev/null +++ b/src/test/groovy/life/qbic/datamodel/datasets/datastructure/folders/nanopore/BasecallingFolderSpec.groovy @@ -0,0 +1,43 @@ +package life.qbic.datamodel.datasets.datastructure.folders.nanopore + +import life.qbic.datamodel.datasets.datastructure.files.DataFile +import life.qbic.datamodel.datasets.datastructure.files.nanopore.GuppyBasecallLog +import spock.lang.Specification + +/** + * + */ +class BasecallingFolderSpec extends Specification { + + def "create basecalling folder"() { + given: + final def name = "basecalling" + final def relativePath = "root/basecalling" + final def children = [] + final def datafile = GuppyBasecallLog.create("guppy_basecall_client_log-.log", "root/basecalling/guppy_basecall_client_log-.log") + children.add(datafile) + + when: + final def dataFolder = BasecallingFolder.create(name, relativePath, children) + + then: + assert dataFolder.getChildren().get(0) instanceof DataFile + + } + + def "naming schema violation should raise an IllegalArgumentException"() { + given: + final def name = "basedcall" + final def relativePath = "root/basedcall" + final def children = [] + final def datafile = GuppyBasecallLog.create("guppy_basecall_client_log-.log", "root/basedcall/guppy_basecall_client_log-.log") + children.add(datafile) + + when: + final def dataFolder = BasecallingFolder.create(name, relativePath, children) + + then: + 
thrown(IllegalArgumentException) + + } +} diff --git a/src/test/resources/nanopore/valid-example-v4-with-basecalling.json b/src/test/resources/nanopore/valid-example-v4-with-basecalling.json new file mode 100644 index 0000000000..2471b09b2c --- /dev/null +++ b/src/test/resources/nanopore/valid-example-v4-with-basecalling.json @@ -0,0 +1,267 @@ +{ + "name": "QABCD001AB_E12A345a01_PAE12345", + "path": "./", + "children": [ + { + "name": "20200122_1217_1-A1-B1-PAE12345_1234567a", + "metadata": { + "adapter": "flongle", + "asic_temp": "32.631687", + "base_caller": "Guppy", + "base_caller_version": "3.2.8+bd67289", + "device_type": "promethion", + "flow_cell_id": "PAE26306", + "flow_cell_product_code": "FLO-PRO002", + "flow_cell_position": "2-A3-D3", + "hostname": "PCT0094", + "protocol": "sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109:True", + "started": "2020-02-11T15:52:10.465982+01:00" + }, + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a", + "children": [ + { + "name": "throughput_.csv", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/throughput_.csv", + "file_type": "csv" + }, + { + "name": "report_.md", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/report_.md", + "file_type": "md" + }, + { + "name": "final_summary_.txt", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/final_summary_.txt", + "file_type": "txt" + }, + { + "name": "fastq_pass", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass", + "children": [ + { + "name": "myfile3.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass/myfile3.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile2.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass/myfile2.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile4.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass/myfile4.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile5.fastq.gz", + "path": 
"./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass/myfile5.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile1.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_pass/myfile1.fastq.gz", + "file_type": "fastq.gz" + } + ] + }, + { + "name": "fastq_fail", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/", + "children": [ + { + "name": "myfile3.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/myfile3.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile2.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/myfile2.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile4.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/myfile4.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile5.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/myfile5.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fastq_fail/myfile.fastq.gz", + "file_type": "fastq.gz" + } + ] + }, + { + "name": "duty_time_.csv", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/duty_time_.csv", + "file_type": "csv" + }, + { + "name": "sequencing_summary_.txt", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/sequencing_summary_.txt", + "file_type": "txt" + }, + { + "name": "mux_scan_data.csv", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/mux_scan_data.csv", + "file_type": "csv" + }, + { + "name": "drift_correction_.csv", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/drift_correction_.csv", + "file_type": "csv" + }, + { + "name": "fast5_fail", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/", + "children": [ + { + "name": "myfile2.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/myfile2.fast5", + "file_type": "fast5" + }, + { + "name": "myfile4.fast5", + "path": 
"./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/myfile4.fast5", + "file_type": "fast5" + }, + { + "name": "myfile3.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/myfile3.fast5", + "file_type": "fast5" + }, + { + "name": "myfile5.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/myfile5.fast5", + "file_type": "fast5" + }, + { + "name": "myfile.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_fail/myfile.fast5", + "file_type": "fast5" + } + ] + }, + { + "name": "fast5_pass", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/", + "children": [ + { + "name": "myfile2.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/myfile2.fast5", + "file_type": "fast5" + }, + { + "name": "myfile4.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/myfile4.fast5", + "file_type": "fast5" + }, + { + "name": "myfile3.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/myfile3.fast5", + "file_type": "fast5" + }, + { + "name": "myfile5.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/myfile5.fast5", + "file_type": "fast5" + }, + { + "name": "myfile.fast5", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/fast5_pass/myfile.fast5", + "file_type": "fast5" + } + ] + }, + { + "name": "basecalling", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/", + "children": [ + { + "name": "sequencing_summary_.txt", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/sequencing_summary_.txt", + "file_type": "txt" + }, + { + "name": "sequencing_telemetry_.js", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/sequencing_telemetry_.js", + "file_type": "js" + }, + { + "name": "guppy_basecall_client_log-1234-56-78_90.log", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/guppy_basecall_client_log-1234-56-78_90.log", + "file_type": "log" + }, + { + "name": "fastq_pass", + "path": 
"./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/", + "children": [ + { + "name": "myfile3.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile3.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile2.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile2.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile4.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile4.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile5.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile5.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_pass/myfile.fastq.gz", + "file_type": "fastq.gz" + } + ] + }, + { + "name": "fastq_fail", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/", + "children": [ + { + "name": "myfile3.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/myfile3.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile2.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/myfile2.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile4.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/myfile4.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile5.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/myfile5.fastq.gz", + "file_type": "fastq.gz" + }, + { + "name": "myfile.fastq.gz", + "path": "./20200122_1217_1-A1-B1-PAE12345_1234567a/basecalling/fastq_fail/myfile.fastq.gz", + "file_type": "fastq.gz" + } + ] + } + ] + } + ] + } + ] +}