Skip to content

Commit

Permalink
Merge pull request #16 from DOED-DAAD/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
mattheww95 authored Dec 27, 2024
2 parents 23534c3 + 600a1eb commit 9d56194
Show file tree
Hide file tree
Showing 20 changed files with 114 additions and 11 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.0.1rc2 - 2024-12-27

### Added

- Added option for linting of existing sample sheets. [PR 16](https://github.com/DOED-DAAD/mikrokondo-tools/pull/16)

- Incorporated `CHANGELOG.md`
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ classifiers = [
dependencies = [
"click",
"requests",
"jsonschema"
"jsonschema",
"pandas"
]

[project.urls]
Expand Down
3 changes: 2 additions & 1 deletion src/mikrokondo_tools/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from mikrokondo_tools.__about__ import __version__
from mikrokondo_tools.cli.download import download
from mikrokondo_tools.cli.samplesheet import samplesheet
from mikrokondo_tools.cli.samplesheet import samplesheet, lint


@click.group(context_settings={"help_option_names": ["-h", "--help"]}, invoke_without_command=True, no_args_is_help=True)
Expand All @@ -18,6 +18,7 @@ def mikrokondo_tools():

mikrokondo_tools.add_command(download)
mikrokondo_tools.add_command(samplesheet)
mikrokondo_tools.add_command(lint)


def safe_entry_point():
Expand Down
8 changes: 7 additions & 1 deletion src/mikrokondo_tools/cli/samplesheet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ def samplesheet(output_sheet, read_1, read_2, input_directory, schema_input):

data = ss.get_samples(p.Path(input_directory))
ngs_data = ss.NGSData(data[0], data[1], read_1, read_2, output_sheet, schema_input)
ngs_data.create_sample_sheet()
ngs_data.create_sample_sheet()

@click.command(
    short_help="Lint an existing sample sheet for errors.",
    no_args_is_help=True,
    context_settings={'show_default': True},
)
@click.option(
    "-s",
    "--schema-input",
    "schema_input",
    type=click.Path(),
    default=None,
    help="An optional schema_input.json file pre-downloaded for mikrokondo.",
)
@click.option(
    "-i",
    "--input-sheet",
    "input_sheet",
    required=True,
    type=click.Path(),
    help="Input sample sheet to use for linting.",
)
def lint(schema_input, input_sheet):
    # CLI wrapper: convert the sheet argument to a Path and hand both values
    # to the samplesheet validator. The schema argument is forwarded untouched
    # (it is None when the option is not supplied).
    sheet_path = p.Path(input_sheet)
    ss.validate_samplesheet(sheet_path, schema_input)
26 changes: 24 additions & 2 deletions src/mikrokondo_tools/samplesheet/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import typing as t
import errno as e

import pandas as pd
import jsonschema as js
import requests

Expand Down Expand Up @@ -129,8 +130,6 @@ def create_sample_sheet(self, sample_data: t.Optional[t.Dict[str, t.List[SampleR
for data in jsonified_data:
output.write(f"{','.join([data[text] for text in header])}\n") # Joining text to maintain order of fields




def validate_json(self, jsonified_data: t.List[dict]):
"""
Expand Down Expand Up @@ -312,3 +311,26 @@ def get_samples(directory: p.Path) -> t.Tuple[t.List[p.Path], t.List[p.Path]]:
raise NoFilesFoundException
return reads, fastas

def validate_samplesheet(sample_sheet: p.Path, json_schema: t.Optional[p.Path] = None) -> t.Dict[str, t.List[SampleRow]]:
    """
    Parse and validate an existing sample sheet.

    The sheet is read as CSV with its first column used as the sample
    identifier. Rows sharing an identifier are grouped together, checked for
    duplicated file paths and then validated against the JSON schema.

    :param sample_sheet: Path to the CSV sample sheet to lint.
    :param json_schema: Optional schema file to validate against; forwarded
        to ``NGSData``.
    :return: The parsed rows, keyed by sample identifier.
    :raises DuplicateFilesException: raised by ``verify_unique_paths`` when a
        path is listed more than once.
    :raises jsonschema.ValidationError: raised by ``validate_json`` when a row
        does not conform to the schema.
    """
    logger.info(f"Reading samplesheet: {str(sample_sheet)}")
    df = pd.read_csv(sample_sheet, index_col=0)
    # Rows are grouped by hand instead of using df.to_dict(orient='index'),
    # because that requires unique index values and a sample sheet may list
    # the same sample on several rows.
    input_data: t.Dict[str, t.List[SampleRow]] = dict()
    for row in df.itertuples():
        sample_name = row.Index
        # Only keep SampleRow fields that are present in the sheet and not empty.
        value = {i.name: getattr(row, i.name) for i in fields(SampleRow) if hasattr(row, i.name) and not pd.isna(getattr(row, i.name))}
        input_data.setdefault(sample_name, []).append(SampleRow(sample=sample_name, **value))
    # The samplesheet CLI command builds NGSData with six positional arguments
    # where the fifth is the output sheet and the sixth is the schema. Passing
    # only five here put the schema into the output-sheet slot, so a
    # user-supplied schema was never used for validation.
    # NOTE(review): confirm the argument order against NGSData.__init__.
    ngs_data = NGSData(None, None, None, None, None, json_schema)
    logger.info("Verifying unique paths in sample sheet are unique.")
    ngs_data.verify_unique_paths(input_data)
    jsonified_schema = ngs_data.jsonify_schema(input_data)
    logger.info("Validating input with provided json schema.")
    ngs_data.validate_json(jsonified_schema)
    logger.info("No errors identified.")
    return input_data

2 changes: 2 additions & 0 deletions tests/samplesheet/data/samplesheet-campy-staph.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
3 changes: 3 additions & 0 deletions tests/samplesheet/data/samplesheet-fail-duplicate-paths.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample,fastq_1,fastq_2,long_reads,assembly
CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
CSE1,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
5 changes: 5 additions & 0 deletions tests/samplesheet/data/samplesheet-make-names-unique.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample,fastq_1,fastq_2,long_reads,assembly
ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,,
4 changes: 4 additions & 0 deletions tests/samplesheet/data/samplesheet-merge-test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample,fastq_1,fastq_2,long_reads,assembly
CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
un-merged,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
5 changes: 5 additions & 0 deletions tests/samplesheet/data/samplesheet-set-ext-id.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample,sample_name,fastq_1,fastq_2,long_reads,assembly
CSE,better.faster.stronger.name,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
CSE2,an even stronger name!,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
unique2,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
unique3,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,,
2 changes: 2 additions & 0 deletions tests/samplesheet/data/samplesheet-small-assembly-inx.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,sample_name,fastq_1,fastq_2,long_reads,assembly
INX,short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,,
2 changes: 2 additions & 0 deletions tests/samplesheet/data/samplesheet-small-assembly.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
2 changes: 2 additions & 0 deletions tests/samplesheet/data/samplesheet-small-metagenomic.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
meta-small,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
listeria_GCF_000196035,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/listeria/GCF_000196035.1_ASM19603v1_genomic.fna.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
salmonella_GCA_000008105,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/salmonella/GCA_000008105.1_ASM810v1_genomic.fna.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,sample_name,fastq_1,fastq_2,long_reads,assembly
INX,.iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
st_120,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
2 changes: 2 additions & 0 deletions tests/samplesheet/data/samplesheet-test-from-assemblies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq_1,fastq_2,long_reads,assembly
ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz
32 changes: 28 additions & 4 deletions tests/samplesheet/test_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,16 @@ def test_validate_json_pass(ngs_data_pass):
ngs_data_pass.validate_json(json_data)



def test_fail_json_validation_fail(ngs_data_pass):
    """
    Schema validation must reject a sample sheet containing invalid rows.
    """
    # The rows below include paths that are expected to be rejected
    # (NOTE(review): presumably the space in 's2 r1.fq.gz', the missing
    # extension on 'st', and the key 's2_r2' whose row names sample 's2_r1'
    # are the intended failures - confirm against the schema).
    outputs = {
        "s1": [
            ss.SampleRow(sample='s1', fastq_1=p.Path('s1_r1_dup.fq.gz'), fastq_2=p.Path('s1_r2_.fq.gz'), long_reads=p.Path('s1.fq.gz'), assembly=p.Path('s1.fa.gz')),
            ss.SampleRow(sample='s1', fastq_1=p.Path('s1_r1_.fq.gz'), fastq_2=p.Path('s1_r2_dup.fq'), long_reads=None, assembly=None),
        ],
        "s2_r1": [ss.SampleRow(sample='s2_r1', fastq_1=None, fastq_2=None, long_reads=p.Path('s2 r1.fq.gz'), assembly=None)],
        "s2_r2": [ss.SampleRow(sample='s2_r1', fastq_1=None, fastq_2=None, long_reads=p.Path('s2_r1.fq.gz'), assembly=None)],
        "s3": [ss.SampleRow(sample='s3', fastq_1=None, fastq_2=None, long_reads=None, assembly=p.Path('s3.fa.gz'))],
        "s4": [ss.SampleRow(sample='s4', fastq_1=None, fastq_2=None, long_reads=None, assembly=p.Path('s4.fa'))],
        "s5": [ss.SampleRow(sample='s5', fastq_1=None, fastq_2=None, long_reads=None, assembly=p.Path('st'))],
    }
    json_data_fail = ngs_data_pass.jsonify_schema(outputs)
    with pytest.raises(js.ValidationError):
        ngs_data_pass.validate_json(json_data_fail)
Expand All @@ -111,4 +113,26 @@ def test_create_sample_sheet(ngs_data_pass, tmp_path):
"s2_r1": [ss.SampleRow(sample='s2_r1', fastq_1=None, fastq_2=None, long_reads=p.Path('s2_r1.fq.gz'), assembly=None)],
"s3": [ss.SampleRow(sample='s3', fastq_1=None, fastq_2=None, long_reads=None, assembly=p.Path('s3.fa.gz'))]}
output = tmp_path / "output_sheet.csv"
ngs_data_pass.create_sample_sheet(outputs, output)
ngs_data_pass.create_sample_sheet(outputs, output)


@pytest.mark.parametrize("samplesheet", [
    "samplesheet-campy-staph.csv",
    "samplesheet-make-names-unique.csv",
    "samplesheet-merge-test.csv",
    "samplesheet-set-ext-id.csv",
    "samplesheet-small-assembly-inx.csv",
    "samplesheet-small-assembly.csv",
    "samplesheet-small-metagenomic.csv",
    "samplesheet-test-from-assemblies-listeria.csv",
    "samplesheet-test-from-assemblies-salmonella.csv",
    "samplesheet-test-from-assemblies-vibrio-stupid-names.csv",
    "samplesheet-test-from-assemblies-vibrio.csv",
    "samplesheet-test-from-assemblies.csv",
])
def test_validate_samplesheet(samplesheet):
    """
    Every bundled example sample sheet must lint without raising.
    """
    # Resolve fixtures relative to this test file so the test does not depend
    # on the directory pytest is launched from.
    sheet_path = p.Path(__file__).parent / "data" / samplesheet
    ss.validate_samplesheet(sheet_path)

def test_validate_samplesheet_faile():
    """
    A sample sheet that lists the same read paths for two samples must be rejected.
    """
    # Resolve the fixture relative to this test file so the test does not
    # depend on the directory pytest is launched from.
    sheet_path = p.Path(__file__).parent / "data" / "samplesheet-fail-duplicate-paths.csv"
    with pytest.raises(ss.DuplicateFilesException):
        ss.validate_samplesheet(sheet_path)
4 changes: 2 additions & 2 deletions tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@



def test_download_json():
    """
    Test the request method for downloading json
    """
    test_logger = u.get_logger(__name__)
    output = u.download_json("https://raw.githubusercontent.com/phac-nml/mikrokondo/refs/heads/main/assets/schema_input.json", test_logger)
    # The exact comparison against the `real_input_schema` fixture is disabled
    # (NOTE(review): presumably because the upstream schema can change
    # independently of the fixture - confirm). Still require that something
    # was downloaded so the test cannot pass vacuously.
    assert output

0 comments on commit 9d56194

Please sign in to comment.