Skip to content

Commit

Permalink
Merge pull request #169 from bigbio/dev
Browse files Browse the repository at this point in the history
Big PR with multiple validations - [DO NOT MERGE YET]
  • Loading branch information
ypriverol authored Jul 30, 2024
2 parents 633da2c + 9d88135 commit b58f9dd
Show file tree
Hide file tree
Showing 8 changed files with 1,640 additions and 1,355 deletions.
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.27"
__version__ = "0.0.28"
39 changes: 36 additions & 3 deletions sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,23 +137,56 @@ def maxquant_from_sdrf(
required=False,
)
@click.option(
"--check_ms", help="check mass spectrometry fields in SDRF (e.g. postranslational modifications)", is_flag=True
"--skip_ms_validation",
help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)",
is_flag=True,
)
@click.option("--skip_factor_validation", help="Disable the validation of factor values in SDRF", is_flag=True)
@click.option(
"--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True
)
@click.pass_context
def validate_sdrf(ctx, sdrf_file: str, template: str, check_ms):
def validate_sdrf(
ctx,
sdrf_file: str,
template: str,
skip_ms_validation: bool,
skip_factor_validation: bool,
skip_experimental_design_validation: bool,
):
"""
Command to validate the SDRF file. The validation is based on the template provided by the user.
User can select the template to be used for validation. If no template is provided, the default template will be used.
Additionally, the mass spectrometry fields and factor values can be validated separately. However, if
the mass spectrometry validation or factor value validation is skipped, the user will be warned about it.
@param sdrf_file: SDRF file to be validated
@param template: template to be used for a validation
@param skip_ms_validation: flag to skip the validation of mass spectrometry fields
@param skip_factor_validation: flag to skip the validation of factor values
@param skip_experimental_design_validation: flag to skip the validation of experimental design
"""

if sdrf_file is None:
msg = "The config file for the pipeline is missing, please provide one "
logging.error(msg)
raise AppConfigException(msg)

if template is None:
template = DEFAULT_TEMPLATE

df = SdrfDataFrame.parse(sdrf_file)
errors = df.validate(template)

if check_ms:
if not skip_ms_validation:
errors = errors + df.validate(MASS_SPECTROMETRY)

if not skip_factor_validation:
errors = errors + df.validate_factor_values()

if not skip_experimental_design_validation:
errors = errors + df.validate_experimental_design()

for error in errors:
print(error)

Expand Down
210 changes: 208 additions & 2 deletions sdrf_pipelines/sdrf/sdrf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
from typing import List

import pandas as pd

Expand All @@ -17,20 +18,34 @@
from sdrf_pipelines.sdrf.sdrf_schema import nonvertebrates_chema
from sdrf_pipelines.sdrf.sdrf_schema import plants_chema
from sdrf_pipelines.sdrf.sdrf_schema import vertebrates_chema
from sdrf_pipelines.utils.exceptions import LogicError


def check_if_integer(x):
    """
    Check if value x from a pandas cell can be converted to an integer.

    The decision is delegated to the built-in ``int``: whatever ``int(x)`` accepts is
    considered convertible, and every conversion failure is reported as ``False``
    instead of being propagated to the caller.

    :param x: value to check
    :return: True if x can be converted to an integer, False otherwise
    """
    try:
        int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None or other unsupported
        # types; OverflowError: infinite floats.
        return False
    return True


class SdrfDataFrame(pd.DataFrame):
@property
def _constructor(self):
    """
    Return the SdrfDataFrame class itself.

    This property is what makes our methods return an instance of SdrfDataFrame.
    NOTE(review): presumably pandas reads this property when it builds the result of
    a DataFrame operation - confirm against the pandas subclassing documentation.
    :return: the SdrfDataFrame class
    """
    return SdrfDataFrame

def get_sdrf_columns(self):
    """
    This method returns the name of the columns of the SDRF.
    :return: the ``columns`` attribute of this data frame
    """
    return self.columns
Expand Down Expand Up @@ -77,3 +92,194 @@ def validate(self, template: str):
errors = mass_spectrometry_schema.validate(self)

return errors

def validate_factor_values(self) -> List[LogicError]:
    """
    Validate the factor value columns of the SDRF.

    Three things are checked:
    - at least one column whose name starts with 'factor value' (case-insensitive) exists;
    - every factor value column has exactly one non-factor column carrying the same
      bracketed name (e.g. 'factor value[age]' <-> 'characteristics[age]');
    - each factor value column holds the same values as its corresponding column.

    :return: A list of LogicError objects, one per problem found, otherwise an empty list.
    """
    errors = []
    # Check if any column starts with 'factor value' (case-insensitive)
    fv_values = [col for col in self.columns if col.lower().startswith("factor value")]

    if not fv_values:
        error_message = f"No factor values present in the following SDRF columns: {self.columns}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # find the corresponding columns for the factor values
    fv_dc = {}
    for fv in fv_values:
        factor = fv.lower().replace("factor value[", "").replace("]", "")
        # Match the bracketed name rather than a raw substring, otherwise a factor such
        # as 'age' would also match 'characteristics[disease stage]'.
        bracketed = f"[{factor}]"
        cols = [
            col for col in self.columns if (bracketed in col.lower() and "factor value" not in col.lower())
        ]
        if len(cols) == 0:
            error_message = f"Make sure your SDRF have a sample characteristics or data comment '{factor}' for your factor value column '{fv}'"
            errors.append(LogicError(error_message, error_type=logging.ERROR))
        elif len(cols) > 1:
            error_message = f"Multiple columns found for factor '{factor}': {cols}"
            errors.append(LogicError(error_message, error_type=logging.ERROR))
        else:
            fv_dc[fv] = cols[0]

    for factor, col in fv_dc.items():
        if self[factor].equals(self[col]):
            continue
        # The factor value column differs from its corresponding column: report the
        # index labels of the rows whose values are different.
        different_values = self[factor][self[factor] != self[col]].index.tolist()
        error_message = f"Factor '{factor}' and column '{col}' do not have the same values for the following rows: {different_values}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors

def validate_experimental_design(self) -> List[LogicError]:
    """
    Validate that the experimental design is correct by running, in order:
    - check_inconsistencies_assay_file: a raw file can only have one associated assay name
      (and the other way around);
    - check_unique_sample_file_combinations;
    - check_accessions_conventions.

    Every check is handed its own empty list and the results are concatenated here, so
    the errors reported by one check can never be dropped by a later one.

    :return: A list of LogicError objects if the experimental design is incorrect, otherwise an empty list.
    """
    checks = (
        self.check_inconsistencies_assay_file,
        self.check_unique_sample_file_combinations,
        self.check_accessions_conventions,
    )

    errors = []
    for check in checks:
        # A fresh list per check: the checks may either append to their argument or
        # return a new list, and both conventions are aggregated correctly this way.
        errors.extend(check([]))

    return errors

def check_inconsistencies_assay_file(self, errors: List[LogicError]) -> List[LogicError]:
    """
    Check that the combination of values 'assay name' and 'comment[data file]' is unique in self,
    i.e. every assay name maps to exactly one data file and every data file to exactly one assay name.

    If one of the two columns is missing, an error is reported for it and the check is skipped.

    :param errors: list the errors found by this check are appended to
    :return: The given list, extended with a LogicError for every direction in which the mapping
    between 'assay name' and 'comment[data file]' is not unique.
    """
    required = ("assay name", "comment[data file]")
    missing = [col for col in required if col not in self.columns]
    if missing:
        # Report the missing columns instead of failing with a KeyError in groupby.
        for col in missing:
            error_message = (
                f"In order to perform experimental design validation, column '{col}' must be present in the SDRF"
            )
            errors.append(LogicError(error_message, error_type=logging.ERROR))
        return errors

    # Assay names that are associated with more than one distinct data file
    files_per_assay = self.groupby("assay name")["comment[data file]"].nunique()
    cell_index = files_per_assay[files_per_assay > 1].index.tolist()
    if cell_index:
        error_message = f"Multiple raw files with the same assay: {cell_index}, the combination assay name and comment[data file] should be unique"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # Data files that are associated with more than one distinct assay name
    assays_per_file = self.groupby("comment[data file]")["assay name"].nunique()
    cell_index = assays_per_file[assays_per_file > 1].index.tolist()
    if cell_index:
        error_message = f"Multiple assays with the same raw files: {cell_index}, the combination assay name and comment[data file] should be unique"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors

def check_unique_sample_file_combinations(self, errors: List[LogicError]) -> List[LogicError]:
    """
    Check that no two rows share the same combination of the following columns:
    - source name
    - comment[technical replicate]
    - characteristics[biological replicate]
    - comment[label]
    - comment[fraction identifier]

    One error is added for every column of that list that is absent from the SDRF; in that
    case the duplicate check itself is not performed.

    :param errors: list the errors found by this check are appended to
    :return: The given list, extended with the LogicError objects produced by this check.
    """
    cols = [
        "source name",
        "comment[technical replicate]",
        "characteristics[biological replicate]",
        "comment[label]",
        "comment[fraction identifier]",
    ]

    missing = [col for col in cols if col not in self.columns]
    for col in missing:
        error_message = (
            f"In order to perform experimental design validation, column '{col}' must be present in the SDRF"
        )
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # Without the full set of columns the combination cannot be evaluated
    if missing:
        return errors

    if self.duplicated(subset=cols, keep=False).any():
        error_message = f"Duplicate samples found in the SDRF for the combinations of the following columns: {cols}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors

def check_accessions_conventions(self, errors: List[LogicError]) -> List[LogicError]:
    """
    Check that the replicate and fraction columns of the SDRF follow the conventions: their
    values must be integers, and those integers must not be lower than 1.

    The columns inspected (when present in the SDRF) are:
    - comment[technical replicate]
    - characteristics[biological replicate]
    - comment[fraction identifier]

    Both findings are reported with error_type logging.WARNING. A value that is not an integer
    is only reported as non-integer; it is not reported a second time as "lower than 1".

    :param errors: list the errors found by this check are appended to; its previous content is kept
    :return: The given list, extended with the LogicError objects produced by this check.
    """
    # Specify the columns to check
    columns_to_check = [
        "comment[technical replicate]",
        "characteristics[biological replicate]",
        "comment[fraction identifier]",
    ]

    # Remove columns that are not present in the dataframe
    columns_to_check = [col for col in columns_to_check if col in self.columns]

    # For each column, the row index labels of the offending values
    non_integer_rows = {}
    lower_than_one = {}
    for column in columns_to_check:
        non_integers = []
        too_low = []
        for row, value in self[column].items():
            if not check_if_integer(value):
                non_integers.append(row)
            elif int(value) < 1:
                too_low.append(row)
        if non_integers:
            non_integer_rows[column] = non_integers
        if too_low:
            lower_than_one[column] = too_low

    if non_integer_rows:
        errors.append(
            LogicError(
                f"Non-integer values found in the following columns and rows: {non_integer_rows}",
                error_type=logging.WARNING,
            )
        )

    if lower_than_one:
        errors.append(
            LogicError(
                f"Values lower than 1 found in the following columns and rows: {lower_than_one}",
                error_type=logging.WARNING,
            )
        )

    return errors
Loading

0 comments on commit b58f9dd

Please sign in to comment.