Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group and disable nanopore schema validation #139

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions src/main/groovy/life/qbic/utils/NanoporeParser.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputDoradoMinim
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputMinimal
import net.jimblackler.jsonschemafriend.Schema
import net.jimblackler.jsonschemafriend.SchemaStore
import net.jimblackler.jsonschemafriend.ValidationError
import net.jimblackler.jsonschemafriend.ValidationException
import net.jimblackler.jsonschemafriend.Validator

Expand All @@ -32,7 +33,10 @@ class NanoporeParser {

String json = mapToJson(convertedDirectory)
// Step2: Validate created Json against schema
validateJson(json)

/* Schema validation has been disabled because the nanopore output structure changes too frequently for the schemas to keep up. */
//validateJson(json)

//Step3: convert valid json to OxfordNanoporeExperiment Object
// Step4: Parse meta data out of report files and extend the map
def finalMap = parseMetaData(convertedDirectory, directory)
Expand Down Expand Up @@ -181,14 +185,20 @@ class NanoporeParser {

SchemaStore schemaStore = new SchemaStore()
Validator validator = new Validator()
try {
//Validate against Fast5 Based Oxford Measurement
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
validator.validate(schema, jsonObject)
} catch (ValidationException ignored) {
//Validate against Pod5 Based Oxford Measurement
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputDoradoMinimal.getSchemaAsStream())
validator.validate(schema, jsonObject)
GroupedValidationErrorException groupedValidationException = new GroupedValidationErrorException()
Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
validator.validate(schema, jsonObject, fast5ValidationError -> {
groupedValidationException.addValidationErrorMessage(fast5ValidationError)
})
schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputDoradoMinimal.getSchemaAsStream())
validator.validate(schema, jsonObject, pod5ValidationError -> {
groupedValidationException.addValidationErrorMessage(pod5ValidationError)
})
if (groupedValidationException.getValidationExceptionErrorMessages().size() == 2) {
groupedValidationException.getValidationExceptionErrorMessages().forEach { validationError ->
log.debug("Nanopore validation failed for " + validationError.toString())
}
throw groupedValidationException
}
}

Expand Down Expand Up @@ -331,6 +341,25 @@ class NanoporeParser {
}
return fileType
}
}

/**
 * A {@link ValidationException} that carries a list of {@link ValidationError}s,
 * so that several validation failures can be collected and reported together.
 */
static class GroupedValidationErrorException extends ValidationException {

    /** Errors collected so far, in insertion order. */
    private final ArrayList<ValidationError> validationErrors = new ArrayList<>()

    /**
     * Creates the exception, optionally pre-filled with validation errors.
     *
     * @param validationErrors zero or more errors to store initially
     */
    GroupedValidationErrorException(ValidationError... validationErrors) {
        for (final validationError in validationErrors) {
            this.validationErrors.add(validationError)
        }
    }

    /**
     * @return the live list of collected validation errors (not a copy)
     */
    ArrayList<ValidationError> getValidationExceptionErrorMessages() {
        return validationErrors
    }

    /**
     * Appends a single validation error to the collected errors.
     *
     * @param validationError the error to record
     */
    void addValidationErrorMessage(ValidationError validationError) {
        // Write to the backing field directly instead of going through the
        // getter-derived property name, which only resolved via Groovy's
        // property-to-getter mapping.
        validationErrors.add(validationError)
    }
}

}
34 changes: 31 additions & 3 deletions src/test/groovy/life/qbic/utils/NanoporeParserSpec.groovy
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package life.qbic.utils

import life.qbic.datamodel.datasets.OxfordNanoporeExperiment
import net.jimblackler.jsonschemafriend.ValidationException
import spock.lang.Specification

import java.nio.file.NotDirectoryException
Expand Down Expand Up @@ -115,6 +114,7 @@ class NanoporeParserSpec extends Specification {
// Check that the metadata from the summary file has been retrieved
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}
/* Schema validation has been disabled because the nanopore output structure changes too frequently for the schemas to keep up.

def "parsing an invalid minimal file structure leads to a ValidationException"() {
given:
Expand All @@ -124,6 +124,7 @@ class NanoporeParserSpec extends Specification {
then:
thrown(ValidationException)
}
*/

def "parsing a valid minimal file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
given:
Expand All @@ -138,6 +139,32 @@ class NanoporeParserSpec extends Specification {
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

// Parses the "minimal_bam" example directory and verifies that an
// OxfordNanoporeExperiment is returned with metadata on its first measurement.
def "parsing a valid minimal file structure with bam files and dorado basecalling returns an OxfordNanoporeExperiment"() {
given:
// Example directory located under the "validates" folder of the test resources root
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_minimal_bam")
when:
def experiment = NanoporeParser.parseFileStructure(pathToDirectory)
then:
assert experiment instanceof OxfordNanoporeExperiment
// Check that the metadata from the report file has been retrieved
assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
// Check that the metadata from the summary file has been retrieved
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

// Parses the "minimal_pod5" example directory and verifies that an
// OxfordNanoporeExperiment is returned with metadata on its first measurement.
def "parsing a valid minimal file structure with pod5 files and dorado basecalling returns an OxfordNanoporeExperiment"() {
given:
// Example directory located under the "validates" folder of the test resources root
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_minimal_pod5")
when:
def experiment = NanoporeParser.parseFileStructure(pathToDirectory)
then:
assert experiment instanceof OxfordNanoporeExperiment
// Check that the metadata from the report file has been retrieved
assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
// Check that the metadata from the summary file has been retrieved
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

def "parsing a valid file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
given:
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_example")
Expand All @@ -146,11 +173,12 @@ class NanoporeParserSpec extends Specification {
then:
assert experiment instanceof OxfordNanoporeExperiment
// Check that the metadata from the report file has been retrieved
assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
//assert experiment.getMeasurements().get(0).getMachineHost() == "PCT0094"
// Check that the metadata from the summary file has been retrieved
assert experiment.getMeasurements().get(0).getLibraryPreparationKit() == "SQK-LSK109-XL"
}

/* Schema validation has been disabled because the nanopore output structure changes too frequently for the schemas to keep up.
def "parsing an invalid minimal file structure for dorado based basecalling leads to a ValidationException"() {
given:
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder")
Expand All @@ -159,7 +187,7 @@ class NanoporeParserSpec extends Specification {
then:
thrown(ValidationException)
}

*/
def "parsing the alternative valid file structure with metadata missing returns an OxfordNanoporeExperiment Object"() {
given:
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_new_minimal")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,25 @@ Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
"asic_temp": "12.34567890",
"device_id": "MN17776",
"device_type": "minion",
"distribution_status": "stable",
"distribution_version": "23.07.12",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"flow_cell_id": "FAV04482",
"flow_cell_product_code": "FLO-MIN114",
"guppy_version": "7.1.4",
"host_product_code": "unknown",
"host_product_serial_number": "",
"hostname": "supermicro02",
"installation_type": "nc",
"operating_system": "ubuntu 18.04",
"protocol_group_id": "2307-Voolstra-Metagen-Pilot",
"protocol_run_id": "",
"protocol_start_time": "",
"sample_id": "Pool1"
}

Duty Time
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Loading