Skip to content

Commit

Permalink
Merge pull request #171 from bigbio/dev
Browse files Browse the repository at this point in the history
skip ols internet check, use instead local parquet files
  • Loading branch information
ypriverol authored Aug 5, 2024
2 parents b58f9dd + b445e77 commit 528db8c
Show file tree
Hide file tree
Showing 12 changed files with 72 additions and 30 deletions.
17 changes: 9 additions & 8 deletions .github/workflows/conda-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,24 @@ jobs:
source $CONDA/etc/profile.d/conda.sh
conda activate sdrf-pipelines
conda build recipe
PACKAGE_PATH=$(find /usr/share/miniconda/envs/sdrf-pipelines/conda-bld/noarch/ -name "sdrf-pipelines-*.tar.bz2" | head -n 1)
conda install --offline "$PACKAGE_PATH"
shell: bash -l {0}

- name: Install the built package
- name: Test the installed package
run: |
source $CONDA/etc/profile.d/conda.sh
conda activate sdrf-pipelines
conda install --use-local sdrf-pipelines
parse_sdrf --help
shell: bash -l {0}

- name: Test the installed package
- name: Test validation of SDRF file
run: |
conda activate sdrf-pipelines
parse_sdrf --help
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv
shell: bash -l {0}

- name: Test other commands
- name: Test validation of SDRF file with cache only
run: |
conda activate sdrf-pipelines
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --check_ms
shell: bash -l {0}
parse_sdrf validate-sdrf --sdrf_file tests/data/reference/PDC000126/PDC000126.sdrf.tsv --use_ols_cache_only
shell: bash -l {0}
4 changes: 3 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ include README.md
include LICENSE

# Include the data files
recursive-include sdrf_pipelines *.xml *.yml
recursive-include sdrf_pipelines *.xml *.yml *.parquet
2 changes: 1 addition & 1 deletion recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# recipe/meta.yaml
package:
name: sdrf-pipelines
version: "0.0.27"
version: "0.0.29"

source:
path: ../
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
pandas
click

# https://github.com/multimeric/PandasSchema does not seem to be maintained
# anymore, and right now it thows a pandas deprecation warning, in the
# future it will stop working.
Expand Down
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.28"
__version__ = "0.0.29"
9 changes: 6 additions & 3 deletions sdrf_pipelines/ols/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,16 +275,19 @@ def get_ancestors(self, ont, iri):
logger.warning("Term was found but ancestor lookup returned an empty response: %s", response.json())
raise ex

def search(self, term: str, ontology: str = None, exact=True, use_ols_cache_only: bool = False, **kwargs):
    """
    Search a term in the OLS
    @:param term: The name of the term
    @:param ontology: The name of the ontology (None searches across all ontologies)
    @:param exact: Forces exact match if not `None`
    @:param use_ols_cache_only: when True, skip the OLS web service entirely and
        resolve the term from the local cache (bundled parquet files) only
    """
    if use_ols_cache_only:
        # Offline mode: never contact the OLS REST API.
        terms = self.cache_search(term, ontology)
    else:
        terms = self.ols_search(term, ontology=ontology, exact=exact, **kwargs)
        # Fall back to the local cache when the web lookup returned nothing.
        if terms is None and self.use_cache:
            terms = self.cache_search(term, ontology)
    return terms

def _perform_ols_search(self, params, name, exact, retry_num=0):
Expand Down
Binary file added sdrf_pipelines/ols/unimod.parquet
Binary file not shown.
9 changes: 7 additions & 2 deletions sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def maxquant_from_sdrf(
@click.option(
"--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True
)
@click.option(
"--use_ols_cache_only", help="Use ols cache for validation of the terms and not OLS internet service", is_flag=True
)
@click.pass_context
def validate_sdrf(
ctx,
Expand All @@ -153,6 +156,7 @@ def validate_sdrf(
skip_ms_validation: bool,
skip_factor_validation: bool,
skip_experimental_design_validation: bool,
use_ols_cache_only: bool,
):
"""
Command to validate the SDRF file. The validation is based on the template provided by the user.
Expand All @@ -165,6 +169,7 @@ def validate_sdrf(
@param skip_ms_validation: flag to skip the validation of mass spectrometry fields
@param skip_factor_validation: flag to skip the validation of factor values
@param skip_experimental_design_validation: flag to skip the validation of experimental design
@param use_ols_cache_only: flag to use the OLS cache for validation of the terms and not OLS internet service
"""

if sdrf_file is None:
Expand All @@ -176,10 +181,10 @@ def validate_sdrf(
template = DEFAULT_TEMPLATE

df = SdrfDataFrame.parse(sdrf_file)
errors = df.validate(template)
errors = df.validate(template, use_ols_cache_only)

if not skip_ms_validation:
errors = errors + df.validate(MASS_SPECTROMETRY)
errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only)

if not skip_factor_validation:
errors = errors + df.validate_factor_values()
Expand Down
16 changes: 8 additions & 8 deletions sdrf_pipelines/sdrf/sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,27 +69,27 @@ def parse(sdrf_file: str):

return SdrfDataFrame(df)

def validate(self, template: str, use_ols_cache_only: bool = False) -> List[LogicError]:
    """
    Validate the SDRF against the schema selected by the template.
    :param template: name of the template/schema to validate against
    :param use_ols_cache_only: resolve ontology terms from the local cache only,
        without calling the OLS web service
    :return: list of LogicError found during validation
    """
    # Map each specialised template to its schema. MASS_SPECTROMETRY is handled
    # separately below because it replaces (rather than extends) the defaults.
    template_schemas = {
        HUMAN_TEMPLATE: human_schema,
        VERTEBRATES_TEMPLATE: vertebrates_chema,
        NON_VERTEBRATES_TEMPLATE: nonvertebrates_chema,
        PLANTS_TEMPLATE: plants_chema,
        CELL_LINES_TEMPLATE: cell_lines_schema,
    }

    errors = []
    if template != MASS_SPECTROMETRY:
        # Every non-MS template must first pass the default schema.
        errors = default_schema.validate(self, use_ols_cache_only=use_ols_cache_only)

    if template in template_schemas:
        errors = errors + template_schemas[template].validate(self, use_ols_cache_only=use_ols_cache_only)
    elif template == MASS_SPECTROMETRY:
        errors = mass_spectrometry_schema.validate(self, use_ols_cache_only=use_ols_cache_only)

    return errors

Expand Down
31 changes: 26 additions & 5 deletions sdrf_pipelines/sdrf/sdrf_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ def validate_optional(self, series):
warnings.append(w)
return warnings

def set_ols_strategy(self, use_ols_cache_only: bool = False):
    """Propagate the OLS lookup strategy to every ontology validation on this column."""
    ontology_checks = (v for v in self.validations if isinstance(v, OntologyTerm))
    for check in ontology_checks:
        check.set_ols_strategy(use_ols_cache_only=use_ols_cache_only)


class OntologyTerm(_SeriesValidation):
"""
Expand All @@ -100,6 +105,7 @@ class OntologyTerm(_SeriesValidation):

def __init__(self, ontology_name: str = None, not_available: bool = False, not_applicable: bool = False, **kwargs):
super().__init__(**kwargs)
self._use_ols_cache_only = False
self._ontology_name = ontology_name
self._not_available = not_available
self._not_applicable = not_applicable
Expand Down Expand Up @@ -140,9 +146,16 @@ def validate(self, series: pd.Series) -> pd.Series:
ontology_terms = None
else:
if self._ontology_name is not None:
ontology_terms = client.search(term[TERM_NAME], ontology=self._ontology_name, exact="true")
ontology_terms = client.search(
term[TERM_NAME],
ontology=self._ontology_name,
exact="true",
use_ols_cache_only=self._use_ols_cache_only,
)
else:
ontology_terms = client.search(term[TERM_NAME], exact="true")
ontology_terms = client.search(
term=term[TERM_NAME], exact="true", use_ols_cache_only=self._use_ols_cache_only
)

if ontology_terms is not None:
query_labels = [o["label"].lower() for o in ontology_terms]
Expand All @@ -154,6 +167,13 @@ def validate(self, series: pd.Series) -> pd.Series:
labels.append(NOT_APPLICABLE)
return series.apply(lambda cell_value: self.validate_ontology_terms(cell_value, labels))

def set_ols_strategy(self, use_ols_cache_only: bool = False):
    """
    Set the strategy to use the OLS cache only
    :param use_ols_cache_only: boolean; when True, term lookups skip the OLS
        web service and use only the local cache
    """
    # Read later by validate() when it calls client.search().
    self._use_ols_cache_only = use_ols_cache_only


class SDRFSchema(Schema):
_special_columns = {"sourcename", "assayname", "materialtype", "technologytype"}
Expand All @@ -168,7 +188,7 @@ def __new__(cls, ordered: bool = False, min_columns: int = 0) -> Any:
obj._min_columns = min_columns
return obj

def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
def validate(self, panda_sdrf: sdrf = None, use_ols_cache_only: bool = False) -> typing.List[LogicError]:
errors = []

# Check the minimum number of columns
Expand All @@ -195,7 +215,7 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
errors.extend(error_columns_order)

# Check that the term is present in ontology
error_ontology_terms = self.validate_columns(panda_sdrf)
error_ontology_terms = self.validate_columns(panda_sdrf, use_ols_cache_only=use_ols_cache_only)
if error_ontology_terms is not None:
for error in error_ontology_terms:
errors.append(error)
Expand Down Expand Up @@ -301,10 +321,11 @@ def _get_column_pairs(self, panda_sdrf):
column_pairs.append((panda_sdrf[column.name], column))
return column_pairs, errors

def validate_columns(self, panda_sdrf, use_ols_cache_only: bool = False):
    """Run every column validation and return the collected errors sorted by row."""
    column_pairs, errors = self._get_column_pairs(panda_sdrf)
    for series, column in column_pairs:
        # Tell ontology validations whether to bypass the OLS web service.
        column.set_ols_strategy(use_ols_cache_only=use_ols_cache_only)
        errors += column.validate(series)
    return sorted(errors, key=lambda err: err.row)

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def get_version(rel_path):
data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])],
package_data={
"sdrf-pipelines": ["*.xml", "*.parquet", "*.yml"],
"sdrf_pipelines": ["*.xml", "*.parquet", "*.yml"],
},
url="https://github.com/bigbio/sdrf-pipelines",
packages=find_packages(),
Expand Down
10 changes: 10 additions & 0 deletions tests/test_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ def test_ontology():
assert len(ontology_list) > 0


def test_ontology_cache():
    # NOTE(review): despite the name, this calls the live OLS service via
    # ols_search; cache-only lookup is covered by test_ontology_from_cache.
    client = OlsClient()
    hits = client.ols_search("homo sapiens", ontology="NCBITaxon")
    print(hits)
    assert len(hits) > 0


def test_ontology_from_cache():
ols = OlsClient()
ontology_list = ols.cache_search("homo sapiens", ontology="NCBITaxon")
Expand Down

0 comments on commit 528db8c

Please sign in to comment.