Update ruff requirement from <0.9 to <0.10 #335

Merged · 3 commits · Feb 6, 2025
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
@@ -45,11 +45,11 @@ jobs:
coverage report -m --skip-covered
- name: Generate coverage report
if: github.ref != 'refs/heads/main'
-uses: orgoro/coverage@v3.1
+uses: orgoro/coverage@v3.2
with:
coverageFile: coverage.xml
token: ${{ secrets.GITHUB_TOKEN }}
-thresholdAll: .9
+thresholdAll: 1
thresholdNew: 1
thresholdModified: 1

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.8.1 # if you update this, also update pyproject.toml
+rev: v0.9.4 # if you update this, also update pyproject.toml
hooks:
- name: Ruff formatting
id: ruff-format
2 changes: 1 addition & 1 deletion cumulus_library/actions/builder.py
@@ -450,7 +450,7 @@ def _query_error(
exit_message: str,
) -> None:
print(
"An error occured executing the following query in ",
"An error occurred executing the following query in ",
f"{query_and_filename[1]}:",
file=sys.stderr,
)
4 changes: 1 addition & 3 deletions cumulus_library/actions/uploader.py
@@ -64,9 +64,7 @@ def upload_data(
def upload_files(args: dict):
"""Wrapper to prep files & console output"""
if args["data_path"] is None:
-sys.exit(
-    "No data directory provided - please provide a path to your" "study export folder."
-)
+sys.exit("No data directory provided - please provide a path to your study export folder.")
file_paths = list(args["data_path"].glob("**/*.parquet"))
if args["target"]:
filtered_paths = []
2 changes: 1 addition & 1 deletion cumulus_library/apis/umls.py
@@ -175,7 +175,7 @@ def download_umls_files(
progress.update(
task,
description=(
f"Downloading {file_meta['fileName']}: " f"{chunks_read/1000} MB"
f"Downloading {file_meta['fileName']}: {chunks_read / 1000} MB"
),
)
if unzip:
4 changes: 1 addition & 3 deletions cumulus_library/builders/base_table_builder.py
@@ -80,9 +80,7 @@ def execute_queries(
with base_utils.query_console_output(config.verbose, query, progress, task):
cursor.execute(query)
except Exception as e: # pylint: disable=broad-exception-caught
-sys.exit(
-    "An error occured executing this query:\n----\n" f"{query}\n----\n" f"{e}"
-)
+sys.exit(f"An error occurred executing this query:\n----\n{query}\n----\n{e}")

self.post_execution(config, *args, **kwargs)

2 changes: 1 addition & 1 deletion cumulus_library/builders/counts.py
@@ -79,7 +79,7 @@ def get_count_query(
"""
if not table_name or not source_table or not table_cols:
raise errors.CountsBuilderError(
"count_query missing required arguments. " f"output table: {table_name}"
f"count_query missing required arguments. output table: {table_name}"
)
for key in kwargs:
if key not in [
6 changes: 2 additions & 4 deletions cumulus_library/builders/protected_table_builder.py
@@ -54,11 +54,9 @@ def prepare_queries(
else:
db_schema = config.schema
transactions = (
f"{manifest.get_study_prefix()}" f"__{enums.ProtectedTables.TRANSACTIONS.value}"
)
statistics = (
f"{manifest.get_study_prefix()}" f"__{enums.ProtectedTables.STATISTICS.value}"
f"{manifest.get_study_prefix()}__{enums.ProtectedTables.TRANSACTIONS.value}"
)
statistics = f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}"
self.queries.append(
base_templates.get_ctas_empty_query(
db_schema,
9 changes: 5 additions & 4 deletions cumulus_library/builders/psm_builder.py
@@ -67,8 +67,9 @@ def __init__(self, toml_config_path: str, data_path: pathlib.Path, **kwargs):
except OSError:
sys.exit(f"PSM configuration not found at {self.toml_path}")
try:
+toml_dir = pathlib.Path(self.toml_path).parent
self.config = PsmConfig(
-classification_json=f"{pathlib.Path(self.toml_path).parent}/{toml_config['classification_json']}",
+classification_json=str(toml_dir.joinpath(toml_config["classification_json"])),
@mikix (Contributor) commented on lines -71 to +72, Feb 6, 2025:

Small behavior change here - before, if the toml specified an absolute path, we'd still prepend the toml dir path (which wouldn't find the resulting path, naturally). Now we support both absolute and relative filenames for this toml field. (I used this in my test, which is why I hit it)
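
As a minimal sketch of the behavior change described in this comment (the paths below are hypothetical; the joinpath semantics are standard pathlib):

import pathlib

toml_dir = pathlib.Path("/studies/my_study")  # hypothetical directory containing the PSM toml

# Relative value: resolved under the toml's directory, as before.
print(toml_dir.joinpath("dsm5_classifications.json"))
# -> /studies/my_study/dsm5_classifications.json

# Absolute value: joinpath() discards the base, so the path is used as-is.
# The old f"{toml_dir}/{value}" form would instead have produced the broken
# path /studies/my_study//tmp/classifications.json.
print(toml_dir.joinpath("/tmp/classifications.json"))
# -> /tmp/classifications.json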

pos_source_table=toml_config["pos_source_table"],
neg_source_table=toml_config["neg_source_table"],
target_table=toml_config["target_table"],
@@ -256,15 +257,15 @@ def psm_effect_size_plot(
def generate_psm_analysis(
self, cursor: databases.DatabaseCursor, schema: str, table_suffix: str
):
-stats_table = f"{self.config.target_table}_{table_suffix}"
"""Runs PSM statistics on generated tables"""
+stats_table = f"{self.config.target_table}_{table_suffix}"
cursor.execute(base_templates.get_alias_table_query(stats_table, self.config.target_table))
df = cursor.execute(
base_templates.get_select_all_query(self.config.target_table)
).as_pandas()
symptoms_dict = self._get_symptoms_dict(self.config.classification_json)
for dependent_variable, codes in symptoms_dict.items():
-df[dependent_variable] = df["code"].apply(lambda x, codes=codes: 1 if x in codes else 0)
+df[dependent_variable] = df["code"].apply(lambda x: 1 if x in codes else 0)
df = df.drop(columns="code")
# instance_count present but unused for PSM if table contains a count_ref input
# (it's intended for manual review)
@@ -336,7 +337,7 @@
)
except ValueError:
sys.exit(
"Encountered a value error during KNN matching. Try increasing " "your sample size."
"Encountered a value error during KNN matching. Try increasing your sample size."
)

def prepare_queries(self, config: base_utils.StudyConfig, *args, table_suffix: str, **kwargs):
2 changes: 1 addition & 1 deletion cumulus_library/cli.py
@@ -441,7 +441,7 @@ def main(cli_args=None):
sys.exit(1)

if args["action"] == "version":
print(f"cumulus-library version: {__version__}\n" "Installed studies:")
print(f"cumulus-library version: {__version__}\nInstalled studies:")
studies = get_study_dict(args.get("study_dir"))
for study in sorted(studies.keys()):
try:
2 changes: 1 addition & 1 deletion cumulus_library/databases/athena.py
@@ -142,7 +142,7 @@ def upload_file(
"AWS KMS encryption is expected for Cumulus buckets"
)
kms_arn = wg_conf.get("EncryptionConfiguration", {}).get("KmsKey", None)
-s3_key = f"{key_prefix}cumulus_user_uploads/{self.schema_name}/" f"{study}/{topic}"
+s3_key = f"{key_prefix}cumulus_user_uploads/{self.schema_name}/{study}/{topic}"
if not remote_filename:
remote_filename = file.name

2 changes: 1 addition & 1 deletion cumulus_library/databases/duckdb.py
@@ -191,7 +191,7 @@ def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list:
output.append((column[0], pyarrow.timestamp("s")))
case _:
raise errors.CumulusLibraryError(
f"{column[0],column[1]} does not have a conversion type"
f"{column[0], column[1]} does not have a conversion type"
)
return output

6 changes: 2 additions & 4 deletions cumulus_library/log_utils.py
@@ -109,12 +109,10 @@ def _log_table(
alter_query = ""
if isinstance(config.db, databases.AthenaDatabaseBackend):
alter_query = (
f"ALTER TABLE {db_schema}.{table_name} " "ADD COLUMNS(message string)"
f"ALTER TABLE {db_schema}.{table_name} ADD COLUMNS(message string)"
)
elif isinstance(config.db, databases.DuckDatabaseBackend):
-alter_query = (
-    f"ALTER TABLE {db_schema}.{table_name} " "ADD COLUMN message varchar"
-)
+alter_query = f"ALTER TABLE {db_schema}.{table_name} ADD COLUMN message varchar"
cursor.execute(alter_query)
cursor.execute(query)
else:
2 changes: 1 addition & 1 deletion cumulus_library/studies/discovery/code_detection.py
@@ -49,7 +49,7 @@ def prepare_queries(
for code_definition in code_definitions.code_list:
if not required_keys.issubset(code_definition):
raise KeyError(
"Expected table_name and column_hierarchy keys in " f"{code_definition!s}"
f"Expected table_name and column_hierarchy keys in {code_definition!s}"
)
code_source = {
"has_data": False,
3 changes: 1 addition & 2 deletions cumulus_library/template_sql/sql_utils.py
@@ -319,8 +319,7 @@ def is_field_populated(
source_field.append(element[0])
else:
raise ValueError(
"sql_utils.is_field_populated: Unexpected type "
f"{element[1]} for field {element[0]}"
f"sql_utils.is_field_populated: Unexpected type {element[1]} for field {element[0]}"
)
query = base_templates.get_is_table_not_empty_query(
source_table=source_table, field=".".join(source_field), unnests=unnests
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dynamic=["version"]
[project.optional-dependencies]
dev = [
# if you update the ruff version, also update .pre-commit-config.yaml
"ruff < 0.9",
"ruff < 0.10",
"pre-commit",
]
test = [
5 changes: 2 additions & 3 deletions tests/conftest.py
@@ -47,7 +47,7 @@ def get_sorted_table_data(cursor, table):
if num_cols == 0:
return [], []
data = cursor.execute(
f"SELECT * FROM '{table}' ORDER BY " f"{','.join(map(str, range(1,num_cols+1)))}"
f"SELECT * FROM '{table}' ORDER BY {','.join(map(str, range(1, num_cols + 1)))}"
).fetchall()
return data, cursor.description

@@ -158,8 +158,7 @@ def update_nested_obj(id_path, obj, i):

def debug_table_schema(cursor, table):
table_schema = cursor.execute(
"select column_name, data_type from information_schema.columns "
f"where table_name='{table}'"
f"select column_name, data_type from information_schema.columns where table_name='{table}'"
).fetchall()
for line in table_schema:
print(line)
8 changes: 4 additions & 4 deletions tests/core/test_core_observation.py
@@ -185,7 +185,7 @@ def test_core_observation_component(tmp_path):
] == rows

df = con.sql(
"SELECT * FROM core__observation_component_dataabsentreason " "ORDER BY id, row, code"
"SELECT * FROM core__observation_component_dataabsentreason ORDER BY id, row, code"
).df()
rows = json.loads(df.to_json(orient="records"))
assert [
@@ -216,7 +216,7 @@ def test_core_observation_component(tmp_path):
] == rows

df = con.sql(
"SELECT * FROM core__observation_component_interpretation " "ORDER BY id, row, code"
"SELECT * FROM core__observation_component_interpretation ORDER BY id, row, code"
).df()
rows = json.loads(df.to_json(orient="records"))
assert [
@@ -263,7 +263,7 @@ def test_core_observation_component(tmp_path):
] == rows

df = con.sql(
"SELECT * FROM core__observation_component_valuecodeableconcept " "ORDER BY id, row, code"
"SELECT * FROM core__observation_component_valuecodeableconcept ORDER BY id, row, code"
).df()
rows = json.loads(df.to_json(orient="records"))
assert [
@@ -286,7 +286,7 @@
] == rows

df = con.sql(
"SELECT * FROM core__observation_component_valuequantity " "ORDER BY id, row, code"
"SELECT * FROM core__observation_component_valuequantity ORDER BY id, row, code"
).df()
rows = json.loads(df.to_json(orient="records"))
assert [
16 changes: 15 additions & 1 deletion tests/test_logging.py
@@ -3,6 +3,7 @@
from datetime import datetime
from unittest import mock

+import duckdb
import pyathena
import pytest
from freezegun import freeze_time
@@ -160,11 +161,24 @@ def test_migrate_transactions_athena(mock_pyathena):
status="debug",
message="message",
)
-expected = "ALTER TABLE test.study_valid__lib_transactions" " ADD COLUMNS(message string)"
+expected = "ALTER TABLE test.study_valid__lib_transactions ADD COLUMNS(message string)"
call_args = mock_pyathena.return_value.cursor.return_value.execute.call_args_list
assert expected == call_args[2][0][0]


+def test_statistics_failure_gets_raised(mock_db_config):
+    mock_db_config.db.cursor = mock.MagicMock()
+    mock_db_config.db.cursor.return_value.execute.side_effect = duckdb.OperationalError
+    with pytest.raises(duckdb.OperationalError):
+        log_utils.log_statistics(
+            config=mock_db_config,
+            manifest=study_manifest.StudyManifest(),
+            table_type="test1",
+            table_name="test2",
+            view_name="test3",
+        )


@freeze_time("2024-01-01")
@pytest.mark.parametrize(
"schema,study,table_type,table_name,view_type,expects,raises",
47 changes: 47 additions & 0 deletions tests/test_psm.py
@@ -132,3 +132,50 @@ def test_psm_error_handling(mock_psm, error, tmp_path, mock_db_stats_config):
drop_table=True,
table_suffix=safe_timestamp,
)


+def test_psm_missing_file(tmp_path):
+    with pytest.raises(SystemExit, match="PSM configuration not found"):
+        psm_builder.PsmBuilder(f"{tmp_path}/does-not-exist.toml", pathlib.Path(tmp_path))
+
+
+def test_psm_missing_keys(tmp_path):
+    toml_file = pathlib.Path(f"{tmp_path}/empty.toml")
+    toml_file.touch()
+    with pytest.raises(SystemExit, match="contains missing/invalid keys"):
+        psm_builder.PsmBuilder(str(toml_file), pathlib.Path(tmp_path))
+
+
+def test_psm_bad_include_cols(tmp_path, mock_db_stats_config):
+    """Provide too many include_cols"""
+    psm_root = f"{pathlib.Path(__file__).parent}/test_data/psm/"
+    with open(f"{tmp_path}/psm.toml", "w", encoding="utf8") as f:
+        f.write(f"""config_type = "psm"
+classification_json = "{psm_root}/dsm5_classifications.json"
+pos_source_table = "psm_test__psm_cohort"
+neg_source_table = "core__condition"
+target_table = "psm_test__psm_encounter_covariate"
+primary_ref = 'encounter_ref'
+dependent_variable = "example_diagnosis"
+pos_sample_size = 20
+neg_sample_size = 100
+[join_cols_by_table.core__encounter]
+join_id = "encounter_ref"
+included_cols = [
+    ["race_display", "race", "age_at_visit"], # too many columns
+]
+""")
+    builder = cli.StudyRunner(mock_db_stats_config, data_path=tmp_path)
+    manifest = study_manifest.StudyManifest(study_path=psm_root)
+    psmbuilder = psm_builder.PsmBuilder(f"{tmp_path}/psm.toml", tmp_path)
+    builder.config.db.cursor().execute(
+        "create table psm_test__psm_cohort as (select * from core__condition "
+        f"ORDER BY {psmbuilder.config.primary_ref} limit 100)"
+    )
+    with pytest.raises(SystemExit, match="unexpected SQL column definition"):
+        psmbuilder.execute_queries(
+            builder.config,
+            manifest,
+            drop_table=True,
+            table_suffix="test",
+        )
2 changes: 1 addition & 1 deletion tests/valueset/test_additional_rules_builder.py
@@ -181,7 +181,7 @@ def test_additional_rules(mock_api, mock_db_config_rxnorm, prefix):
]:
res = cursor.execute(
f"Select * from {table_conf['name']} order by "
f"{','.join([str(x+1) for x in range(table_conf['columns'])])}"
f"{','.join([str(x + 1) for x in range(table_conf['columns'])])}"
)
data = res.fetchall()
assert len(data) == table_conf["count"]
Expand Down