Skip to content

Commit

Permalink
Merge pull request #171 from bigbio/dev
Browse files Browse the repository at this point in the history
skip ols internet check, use instead local parquet files
  • Loading branch information
ypriverol authored Aug 5, 2024
2 parents b58f9dd + b445e77 commit 528db8c
Show file tree
Hide file tree
Showing 12 changed files with 72 additions and 30 deletions.
17 changes: 9 additions & 8 deletions .github/workflows/conda-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,24 @@ jobs:
source $CONDA/etc/profile.d/conda.sh
conda activate sdrf-pipelines
conda build recipe
PACKAGE_PATH=$(find /usr/share/miniconda/envs/sdrf-pipelines/conda-bld/noarch/ -name "sdrf-pipelines-*.tar.bz2" | head -n 1)
conda install --offline "$PACKAGE_PATH"
shell: bash -l {0}

- name: Install the built package
- name: Test the installed package
run: |
source $CONDA/etc/profile.d/conda.sh
conda activate sdrf-pipelines
conda install --use-local sdrf-pipelines
parse_sdrf --help
shell: bash -l {0}

- name: Test the installed package
- name: Test validation of SDRF file
run: |
conda activate sdrf-pipelines
parse_sdrf --help
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv
shell: bash -l {0}

- name: Test other commands
- name: Test validation of SDRF file with cache only
run: |
conda activate sdrf-pipelines
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --check_ms
shell: bash -l {0}
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --use_ols_cache_only
shell: bash -l {0}
4 changes: 3 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ include README.md
include LICENSE

# Include the data files
recursive-include sdrf_pipelines *.xml *.yml
recursive-include sdrf_pipelines *.xml *.yml *.parquet
2 changes: 1 addition & 1 deletion recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# recipe/meta.yaml
package:
name: sdrf-pipelines
version: "0.0.27"
version: "0.0.29"

source:
path: ../
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
pandas
click

# https://github.com/multimeric/PandasSchema does not seem to be maintained
# anymore, and right now it thows a pandas deprecation warning, in the
# future it will stop working.
Expand Down
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.28"
__version__ = "0.0.29"
9 changes: 6 additions & 3 deletions sdrf_pipelines/ols/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,16 +275,19 @@ def get_ancestors(self, ont, iri):
logger.warning("Term was found but ancestor lookup returned an empty response: %s", response.json())
raise ex

def search(self, term: str, ontology: str = None, exact=True, use_ols_cache_only: bool = False, **kwargs):
    """
    Search a term in the OLS
    @:param term: The name of the term
    @:param ontology: The name of the ontology (None searches across all ontologies)
    @:param exact: Forces exact match if not `None`
    @:param use_ols_cache_only: when True, skip the OLS web service entirely and
        resolve the term from the local cache (bundled parquet files) only
    """
    if use_ols_cache_only:
        # Offline mode: never contact the OLS REST API.
        terms = self.cache_search(term, ontology)
    else:
        terms = self.ols_search(term, ontology=ontology, exact=exact, **kwargs)
        # Fall back to the local cache when the web lookup returned nothing.
        if terms is None and self.use_cache:
            terms = self.cache_search(term, ontology)
    return terms

def _perform_ols_search(self, params, name, exact, retry_num=0):
Expand Down
Binary file added sdrf_pipelines/ols/unimod.parquet
Binary file not shown.
9 changes: 7 additions & 2 deletions sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def maxquant_from_sdrf(
@click.option(
"--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True
)
@click.option(
"--use_ols_cache_only", help="Use ols cache for validation of the terms and not OLS internet service", is_flag=True
)
@click.pass_context
def validate_sdrf(
ctx,
Expand All @@ -153,6 +156,7 @@ def validate_sdrf(
skip_ms_validation: bool,
skip_factor_validation: bool,
skip_experimental_design_validation: bool,
use_ols_cache_only: bool,
):
"""
Command to validate the SDRF file. The validation is based on the template provided by the user.
Expand All @@ -165,6 +169,7 @@ def validate_sdrf(
@param skip_ms_validation: flag to skip the validation of mass spectrometry fields
@param skip_factor_validation: flag to skip the validation of factor values
@param skip_experimental_design_validation: flag to skip the validation of experimental design
@param use_ols_cache_only: flag to use the OLS cache for validation of the terms and not OLS internet service
"""

if sdrf_file is None:
Expand All @@ -176,10 +181,10 @@ def validate_sdrf(
template = DEFAULT_TEMPLATE

df = SdrfDataFrame.parse(sdrf_file)
errors = df.validate(template)
errors = df.validate(template, use_ols_cache_only)

if not skip_ms_validation:
errors = errors + df.validate(MASS_SPECTROMETRY)
errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only)

if not skip_factor_validation:
errors = errors + df.validate_factor_values()
Expand Down
16 changes: 8 additions & 8 deletions sdrf_pipelines/sdrf/sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,27 +69,27 @@ def parse(sdrf_file: str):

return SdrfDataFrame(df)

def validate(self, template: str, use_ols_cache_only: bool = False) -> List[LogicError]:
    """
    Validate the SDRF against the schema selected by the template.
    :param template: name of the template/schema to validate against
    :param use_ols_cache_only: resolve ontology terms from the local cache only,
        without calling the OLS web service
    :return: list of LogicError found during validation
    """
    # Map each specialised template to its schema. MASS_SPECTROMETRY is handled
    # separately below because it replaces (rather than extends) the defaults.
    template_schemas = {
        HUMAN_TEMPLATE: human_schema,
        VERTEBRATES_TEMPLATE: vertebrates_chema,
        NON_VERTEBRATES_TEMPLATE: nonvertebrates_chema,
        PLANTS_TEMPLATE: plants_chema,
        CELL_LINES_TEMPLATE: cell_lines_schema,
    }

    errors = []
    if template != MASS_SPECTROMETRY:
        # Every non-MS template must first pass the default schema.
        errors = default_schema.validate(self, use_ols_cache_only=use_ols_cache_only)

    if template in template_schemas:
        errors = errors + template_schemas[template].validate(self, use_ols_cache_only=use_ols_cache_only)
    elif template == MASS_SPECTROMETRY:
        errors = mass_spectrometry_schema.validate(self, use_ols_cache_only=use_ols_cache_only)

    return errors

Expand Down
31 changes: 26 additions & 5 deletions sdrf_pipelines/sdrf/sdrf_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ def validate_optional(self, series):
warnings.append(w)
return warnings

def set_ols_strategy(self, use_ols_cache_only: bool = False):
    """Propagate the OLS lookup strategy to every ontology validation on this column."""
    ontology_checks = (v for v in self.validations if isinstance(v, OntologyTerm))
    for check in ontology_checks:
        check.set_ols_strategy(use_ols_cache_only=use_ols_cache_only)


class OntologyTerm(_SeriesValidation):
"""
Expand All @@ -100,6 +105,7 @@ class OntologyTerm(_SeriesValidation):

def __init__(self, ontology_name: str = None, not_available: bool = False, not_applicable: bool = False, **kwargs):
super().__init__(**kwargs)
self._use_ols_cache_only = False
self._ontology_name = ontology_name
self._not_available = not_available
self._not_applicable = not_applicable
Expand Down Expand Up @@ -140,9 +146,16 @@ def validate(self, series: pd.Series) -> pd.Series:
ontology_terms = None
else:
if self._ontology_name is not None:
ontology_terms = client.search(term[TERM_NAME], ontology=self._ontology_name, exact="true")
ontology_terms = client.search(
term[TERM_NAME],
ontology=self._ontology_name,
exact="true",
use_ols_cache_only=self._use_ols_cache_only,
)
else:
ontology_terms = client.search(term[TERM_NAME], exact="true")
ontology_terms = client.search(
term=term[TERM_NAME], exact="true", use_ols_cache_only=self._use_ols_cache_only
)

if ontology_terms is not None:
query_labels = [o["label"].lower() for o in ontology_terms]
Expand All @@ -154,6 +167,13 @@ def validate(self, series: pd.Series) -> pd.Series:
labels.append(NOT_APPLICABLE)
return series.apply(lambda cell_value: self.validate_ontology_terms(cell_value, labels))

def set_ols_strategy(self, use_ols_cache_only: bool = False):
    """
    Set the strategy to use the OLS cache only
    :param use_ols_cache_only: boolean; when True, term lookups skip the OLS
        web service and use only the local cache
    """
    # Read later by validate() when it calls client.search().
    self._use_ols_cache_only = use_ols_cache_only


class SDRFSchema(Schema):
_special_columns = {"sourcename", "assayname", "materialtype", "technologytype"}
Expand All @@ -168,7 +188,7 @@ def __new__(cls, ordered: bool = False, min_columns: int = 0) -> Any:
obj._min_columns = min_columns
return obj

def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
def validate(self, panda_sdrf: sdrf = None, use_ols_cache_only: bool = False) -> typing.List[LogicError]:
errors = []

# Check the minimum number of columns
Expand All @@ -195,7 +215,7 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
errors.extend(error_columns_order)

# Check that the term is present in ontology
error_ontology_terms = self.validate_columns(panda_sdrf)
error_ontology_terms = self.validate_columns(panda_sdrf, use_ols_cache_only=use_ols_cache_only)
if error_ontology_terms is not None:
for error in error_ontology_terms:
errors.append(error)
Expand Down Expand Up @@ -301,10 +321,11 @@ def _get_column_pairs(self, panda_sdrf):
column_pairs.append((panda_sdrf[column.name], column))
return column_pairs, errors

def validate_columns(self, panda_sdrf, use_ols_cache_only: bool = False):
    """Run every column validation and return the collected errors sorted by row."""
    column_pairs, errors = self._get_column_pairs(panda_sdrf)
    for series, column in column_pairs:
        # Tell ontology validations whether to bypass the OLS web service.
        column.set_ols_strategy(use_ols_cache_only=use_ols_cache_only)
        errors += column.validate(series)
    return sorted(errors, key=lambda err: err.row)

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def get_version(rel_path):
data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])],
package_data={
"sdrf-pipelines": ["*.xml", "*.parquet", "*.yml"],
"sdrf_pipelines": ["*.xml", "*.parquet", "*.yml"],
},
url="https://github.com/bigbio/sdrf-pipelines",
packages=find_packages(),
Expand Down
10 changes: 10 additions & 0 deletions tests/test_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ def test_ontology():
assert len(ontology_list) > 0


def test_ontology_cache():
    # NOTE(review): despite the name, this calls the live OLS service via
    # ols_search; cache-only lookup is covered by test_ontology_from_cache.
    client = OlsClient()
    hits = client.ols_search("homo sapiens", ontology="NCBITaxon")
    print(hits)
    assert len(hits) > 0


def test_ontology_from_cache():
ols = OlsClient()
ontology_list = ols.cache_search("homo sapiens", ontology="NCBITaxon")
Expand Down

0 comments on commit 528db8c

Please sign in to comment.