diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index 1d929ee..1e49844 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -38,23 +38,24 @@ jobs: source $CONDA/etc/profile.d/conda.sh conda activate sdrf-pipelines conda build recipe + PACKAGE_PATH=$(find /usr/share/miniconda/envs/sdrf-pipelines/conda-bld/noarch/ -name "sdrf-pipelines-*.tar.bz2" | head -n 1) + conda install --offline "$PACKAGE_PATH" shell: bash -l {0} - - name: Install the built package + - name: Test the installed package run: | - source $CONDA/etc/profile.d/conda.sh conda activate sdrf-pipelines - conda install --use-local sdrf-pipelines + parse_sdrf --help shell: bash -l {0} - - name: Test the installed package + - name: Test validation of SDRF file run: | conda activate sdrf-pipelines - parse_sdrf --help + parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv shell: bash -l {0} - - name: Test other commands + - name: Test validation of SDRF file with cache only run: | conda activate sdrf-pipelines - parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --check_ms - shell: bash -l {0} + parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --use_ols_cache_only + shell: bash -l {0} \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index e99df53..50a60dc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,4 +5,6 @@ include README.md include LICENSE # Include the data files -recursive-include sdrf_pipelines *.xml *.yml +recursive-include sdrf_pipelines *.xml *.yml *.parquet +recursive-include sdrf_pipelines *.parquet +include sdrf_pipelines/ols/*.parquet \ No newline at end of file diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 3f2827c..4d46474 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,7 +1,7 @@ # recipe/meta.yaml package: name: sdrf-pipelines - version: "0.0.27" + version: "0.0.29" source: 
path: ../ diff --git a/requirements.txt b/requirements.txt index e503651..ca30a5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ pandas click - # https://github.com/multimeric/PandasSchema does not seem to be maintained # anymore, and right now it thows a pandas deprecation warning, in the # future it will stop working. diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 4d04613..a3024cf 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.28" +__version__ = "0.0.29" diff --git a/sdrf_pipelines/ols/ols.py b/sdrf_pipelines/ols/ols.py index c7666f8..ebfc4c6 100644 --- a/sdrf_pipelines/ols/ols.py +++ b/sdrf_pipelines/ols/ols.py @@ -275,16 +275,19 @@ def get_ancestors(self, ont, iri): logger.warning("Term was found but ancestor lookup returned an empty response: %s", response.json()) raise ex - def search(self, term: str, ontology: str, exact=True, **kwargs): + def search(self, term: str, ontology: str = None, exact=True, use_ols_cache_only: bool = False, **kwargs): """ Search a term in the OLS @:param term: The name of the term @:param ontology: The name of the ontology @:param exact: Forces exact match if not `None` """ - terms = self.ols_search(term, ontology=ontology, exact=exact, **kwargs) - if terms is None and self.use_cache: + if use_ols_cache_only: terms = self.cache_search(term, ontology) + else: + terms = self.ols_search(term, ontology=ontology, exact=exact, **kwargs) + if terms is None and self.use_cache: + terms = self.cache_search(term, ontology) return terms def _perform_ols_search(self, params, name, exact, retry_num=0): diff --git a/sdrf_pipelines/ols/unimod.parquet b/sdrf_pipelines/ols/unimod.parquet new file mode 100644 index 0000000..ed8c630 Binary files /dev/null and b/sdrf_pipelines/ols/unimod.parquet differ diff --git a/sdrf_pipelines/parse_sdrf.py b/sdrf_pipelines/parse_sdrf.py index 6ffb7d2..9de5e39 100755 --- a/sdrf_pipelines/parse_sdrf.py +++ 
b/sdrf_pipelines/parse_sdrf.py @@ -145,6 +145,9 @@ def maxquant_from_sdrf( @click.option( "--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True ) +@click.option( + "--use_ols_cache_only", help="Use ols cache for validation of the terms and not OLS internet service", is_flag=True +) @click.pass_context def validate_sdrf( ctx, @@ -153,6 +156,7 @@ def validate_sdrf( skip_ms_validation: bool, skip_factor_validation: bool, skip_experimental_design_validation: bool, + use_ols_cache_only: bool, ): """ Command to validate the SDRF file. The validation is based on the template provided by the user. @@ -165,6 +169,7 @@ def validate_sdrf( @param skip_ms_validation: flag to skip the validation of mass spectrometry fields @param skip_factor_validation: flag to skip the validation of factor values @param skip_experimental_design_validation: flag to skip the validation of experimental design + @param use_ols_cache_only: flag to use the OLS cache for validation of the terms and not OLS internet service """ if sdrf_file is None: @@ -176,10 +181,10 @@ def validate_sdrf( template = DEFAULT_TEMPLATE df = SdrfDataFrame.parse(sdrf_file) - errors = df.validate(template) + errors = df.validate(template, use_ols_cache_only) if not skip_ms_validation: - errors = errors + df.validate(MASS_SPECTROMETRY) + errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only) if not skip_factor_validation: errors = errors + df.validate_factor_values() diff --git a/sdrf_pipelines/sdrf/sdrf.py b/sdrf_pipelines/sdrf/sdrf.py index 958aaef..9e446cc 100644 --- a/sdrf_pipelines/sdrf/sdrf.py +++ b/sdrf_pipelines/sdrf/sdrf.py @@ -69,27 +69,27 @@ def parse(sdrf_file: str): return SdrfDataFrame(df) - def validate(self, template: str): + def validate(self, template: str, use_ols_cache_only: bool = False) -> List[LogicError]: """ Validate a corresponding SDRF :return: """ errors = [] if template != MASS_SPECTROMETRY: - errors = 
default_schema.validate(self) + errors = default_schema.validate(self, use_ols_cache_only=use_ols_cache_only) if template == HUMAN_TEMPLATE: - errors = errors + human_schema.validate(self) + errors = errors + human_schema.validate(self, use_ols_cache_only=use_ols_cache_only) elif template == VERTEBRATES_TEMPLATE: - errors = errors + vertebrates_chema.validate(self) + errors = errors + vertebrates_chema.validate(self, use_ols_cache_only=use_ols_cache_only) elif template == NON_VERTEBRATES_TEMPLATE: - errors = errors + nonvertebrates_chema.validate(self) + errors = errors + nonvertebrates_chema.validate(self, use_ols_cache_only=use_ols_cache_only) elif template == PLANTS_TEMPLATE: - errors = errors + plants_chema.validate(self) + errors = errors + plants_chema.validate(self, use_ols_cache_only=use_ols_cache_only) elif template == CELL_LINES_TEMPLATE: - errors = errors + cell_lines_schema.validate(self) + errors = errors + cell_lines_schema.validate(self, use_ols_cache_only=use_ols_cache_only) elif template == MASS_SPECTROMETRY: - errors = mass_spectrometry_schema.validate(self) + errors = mass_spectrometry_schema.validate(self, use_ols_cache_only=use_ols_cache_only) return errors diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index 381ce84..9d85918 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -92,6 +92,11 @@ def validate_optional(self, series): warnings.append(w) return warnings + def set_ols_strategy(self, use_ols_cache_only: bool = False): + for validation in self.validations: + if isinstance(validation, OntologyTerm): + validation.set_ols_strategy(use_ols_cache_only=use_ols_cache_only) + class OntologyTerm(_SeriesValidation): """ @@ -100,6 +105,7 @@ class OntologyTerm(_SeriesValidation): def __init__(self, ontology_name: str = None, not_available: bool = False, not_applicable: bool = False, **kwargs): super().__init__(**kwargs) + self._use_ols_cache_only = False 
self._ontology_name = ontology_name         self._not_available = not_available         self._not_applicable = not_applicable @@ -140,9 +146,16 @@ def validate(self, series: pd.Series) -> pd.Series:             ontology_terms = None         else:             if self._ontology_name is not None: -                ontology_terms = client.search(term[TERM_NAME], ontology=self._ontology_name, exact="true") +                ontology_terms = client.search( +                    term[TERM_NAME], +                    ontology=self._ontology_name, +                    exact="true", +                    use_ols_cache_only=self._use_ols_cache_only, +                ) else: -                ontology_terms = client.search(term[TERM_NAME], exact="true") +                ontology_terms = client.search( +                    term=term[TERM_NAME], exact="true", use_ols_cache_only=self._use_ols_cache_only +                ) if ontology_terms is not None:                 query_labels = [o["label"].lower() for o in ontology_terms] @@ -154,6 +167,13 @@ def validate(self, series: pd.Series) -> pd.Series:             labels.append(NOT_APPLICABLE)         return series.apply(lambda cell_value: self.validate_ontology_terms(cell_value, labels)) +    def set_ols_strategy(self, use_ols_cache_only: bool = False): +        """ +        Set the strategy to use the OLS cache only +        :param use_ols_cache_only: boolean +        """ +        self._use_ols_cache_only = use_ols_cache_only + class SDRFSchema(Schema):     _special_columns = {"sourcename", "assayname", "materialtype", "technologytype"} @@ -168,7 +188,7 @@ def __new__(cls, ordered: bool = False, min_columns: int = 0) -> Any:         obj._min_columns = min_columns         return obj -    def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]: +    def validate(self, panda_sdrf: sdrf = None, use_ols_cache_only: bool = False) -> typing.List[LogicError]:         errors = []  # Check the minimum number of columns @@ -195,7 +215,7 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:         errors.extend(error_columns_order) # Check that the term is present in ontology -        error_ontology_terms = self.validate_columns(panda_sdrf) +        error_ontology_terms = self.validate_columns(panda_sdrf, use_ols_cache_only=use_ols_cache_only)         if error_ontology_terms is 
not None: for error in error_ontology_terms: errors.append(error) @@ -301,10 +321,11 @@ def _get_column_pairs(self, panda_sdrf): column_pairs.append((panda_sdrf[column.name], column)) return column_pairs, errors - def validate_columns(self, panda_sdrf): + def validate_columns(self, panda_sdrf, use_ols_cache_only: bool = False): # Iterate over each pair of schema columns and data frame series and run validations column_pairs, errors = self._get_column_pairs(panda_sdrf) for series, column in column_pairs: + column.set_ols_strategy(use_ols_cache_only=use_ols_cache_only) errors += column.validate(series) return sorted(errors, key=lambda e: e.row) diff --git a/setup.py b/setup.py index cae80cb..dcbaf19 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ def get_version(rel_path): data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])], package_data={ "sdrf-pipelines": ["*.xml", "*.parquet", "*.yml"], + "sdrf_pipelines": ["*.xml", "*.parquet", "*.yml"], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), diff --git a/tests/test_ontology.py b/tests/test_ontology.py index 96d9e9a..18f2757 100644 --- a/tests/test_ontology.py +++ b/tests/test_ontology.py @@ -8,6 +8,16 @@ def test_ontology(): assert len(ontology_list) > 0 +def test_ontology_cache(): + ols = OlsClient() + ontology_list = ols.ols_search( + "homo sapiens", + ontology="NCBITaxon", + ) + print(ontology_list) + assert len(ontology_list) > 0 + + def test_ontology_from_cache(): ols = OlsClient() ontology_list = ols.cache_search("homo sapiens", ontology="NCBITaxon")