Move default cache location to /tmp/nxontology-ml/cache
yonromai committed Oct 20, 2023
1 parent 1b59233 commit 834a3fa
Showing 25 changed files with 150 additions and 124 deletions.
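The new default location referenced throughout this commit comes from a CACHE_DIR constant in nxontology_ml/utils.py, a file whose diff did not load in this view. A minimal sketch of the constant, assuming it lives next to the existing ROOT_DIR; the path itself is confirmed by the assertion in _cache_test.py below:

    # nxontology_ml/utils.py (sketch; ROOT_DIR's exact expression is assumed)
    from pathlib import Path

    ROOT_DIR = Path(__file__).parent.parent
    CACHE_DIR = Path("/tmp/nxontology-ml/cache")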
2 changes: 2 additions & 0 deletions experimentation/embeddings_warmup.py
@@ -5,6 +5,7 @@
     DEFAULT_EMBEDDING_MODEL,
     AutoModelEmbeddings,
 )
+from nxontology_ml.utils import CACHE_DIR
@@ -14,6 +15,7 @@ def warmup_cache(
     # Warm up the embedding cache
     ame = ame or AutoModelEmbeddings.from_pretrained(
         DEFAULT_EMBEDDING_MODEL,
+        cache_dir=CACHE_DIR,
     )
     nxo = get_efo_otar_slim()
     X, _ = read_training_data(take=take)
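The call sites above and in the tests below imply that AutoModelEmbeddings.from_pretrained now takes a cache_dir directory instead of the old cache_path file argument. A usage sketch (the embeddings module path is assumed, since the import statement sits outside the hunk):

    from nxontology_ml.text_embeddings.embeddings_model import (  # module path assumed
        DEFAULT_EMBEDDING_MODEL,
        AutoModelEmbeddings,
    )
    from nxontology_ml.utils import CACHE_DIR

    # Embeddings are cached under CACHE_DIR instead of a hard-coded .ldb
    # file inside the repository tree.
    ame = AutoModelEmbeddings.from_pretrained(DEFAULT_EMBEDDING_MODEL, cache_dir=CACHE_DIR)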
7 changes: 1 addition & 6 deletions experimentation/model_runner.py
@@ -139,12 +139,7 @@ def run_experiments(
             SubsetsFeatures(enabled=exp.subsets_enabled),
             TherapeuticAreaFeatures(enabled=exp.ta_enabled),
             GptTagFeatures.from_config(exp.gpt_tagger_config),
-            TextEmbeddingsTransformer.from_config(
-                enabled=exp.embedding_enabled,
-                pca_components=exp.pca_components,
-                use_lda=exp.use_lda,
-                embedding_model=ame,
-            ),
+            TextEmbeddingsTransformer.from_config(conf=exp, embedding_model=ame),
             CatBoostDataFormatter(),
         )
         mmb.steps_from_pipeline(feature_pipeline)
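The runner now hands the whole experiment config to the transformer rather than unpacking individual flags at every call site. The updated from_config is not among the loaded diffs; a hypothetical sketch, assuming it reads the same ModelConfig fields that were previously passed one by one:

    @classmethod
    def from_config(
        cls,
        conf: ModelConfig,
        embedding_model: AutoModelEmbeddings | None = None,
    ) -> "TextEmbeddingsTransformer":
        # Hypothetical body: the flag-to-field mapping now lives in one
        # place, so new embedding options only need handling here.
        return cls(
            enabled=conf.embedding_enabled,
            pca_components=conf.pca_components,
            use_lda=conf.use_lda,
            embedding_model=embedding_model,
        )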
3 changes: 1 addition & 2 deletions experimentation/tests/embeddings_warmup_test.py
@@ -9,8 +9,7 @@
 def test_warmup_cache() -> None:
     ame = AutoModelEmbeddings.from_pretrained(
         DEFAULT_EMBEDDING_MODEL,
-        cache_path=ROOT_DIR
-        / "nxontology_ml/text_embeddings/tests/test_resources/embeddings_cache.ldb",
+        cache_dir=ROOT_DIR / "nxontology_ml/text_embeddings/tests/test_resources",
     )
     warmup_cache(ame=ame, take=10)
     assert dict(ame._counter) == {"AutoModelEmbeddings/CACHE_HIT": 10}
6 changes: 4 additions & 2 deletions experimentation/tests/gpt_tags_warmup_test.py
@@ -1,12 +1,14 @@
 from experimentation.gpt_tags_warmup import warmup_gpt_tags
+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger


-def test_warmup_gpt_tags() -> None:
+def test_warmup_gpt_tags(precision_config: TaskConfig) -> None:
     tagger = mk_test_gpt_tagger(
+        config=precision_config,
         cache_content={
             "/a93f3eabc24f867ae4f1d6b371ba6734e38ea0a4": b'["medium"]',
-        }
+        },
     )
     warmup_gpt_tags(
         tagger=tagger,
4 changes: 3 additions & 1 deletion experimentation/tests/model_runner_test.py
@@ -9,7 +9,9 @@


 def test_run_experiments(tmp_path: Path) -> None:
-    ame = AutoModelEmbeddings.from_pretrained(DEFAULT_EMBEDDING_MODEL)
+    ame = AutoModelEmbeddings.from_pretrained(
+        DEFAULT_EMBEDDING_MODEL, cache_dir=tmp_path
+    )
     experiments = [
         ModelConfig(
             eval_metric="BiasedMaeMetric",
7 changes: 2 additions & 5 deletions nxontology_ml/gpt_tagger/_cache.py
@@ -10,7 +10,6 @@

 from nxontology_ml.gpt_tagger._models import TaskConfig
 from nxontology_ml.gpt_tagger._utils import config_to_cache_namespace, counter_or_empty
-from nxontology_ml.utils import ROOT_DIR


 class _Cache:
@@ -70,12 +69,10 @@ def from_config(
         cls,
         config: TaskConfig,
         counter: Counter[str] | None = None,
-        cache_path: Path | None = None,
     ) -> "_Cache":
         cache_namespace = config_to_cache_namespace(config)
-        if not cache_path:
-            cache_path = ROOT_DIR / f".cache/{cache_namespace}.ldb"
-        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        config.cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_path = config.cache_dir / f"{cache_namespace}.ldb"
         return cls(
             storage=LazyLSM(cache_path.as_posix()),
             namespace="",  # Namespace is already part of the storage path
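Net effect: the cache file name is still derived from the config's namespace, but the directory now comes from config.cache_dir instead of a hard-coded ROOT_DIR/.cache. An illustration, with values taken from the updated test below:

    config = TaskConfig(
        name="precision",
        prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
        openai_model_name="gpt-4",
        node_attributes=["efo_id", "efo_label", "efo_definition"],
        model_n=3,
    )
    cache = _Cache.from_config(config)
    # Old default path: <repo>/.cache/precision_v1_n3.ldb
    # New default path: /tmp/nxontology-ml/cache/precision_v1_n3.ldb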
4 changes: 3 additions & 1 deletion nxontology_ml/gpt_tagger/_models.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from pathlib import Path

-from nxontology_ml.utils import ROOT_DIR
+from nxontology_ml.utils import CACHE_DIR, ROOT_DIR

 LOG_DIR = ROOT_DIR / "logs/openai-api"

@@ -70,6 +70,8 @@ class TaskConfig:
     # Optionally persist logs to disk
     logs_path: Path | None = LOG_DIR

+    cache_dir: Path = CACHE_DIR
+

 @dataclass
 class LabelledNode:
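Because cache_dir is a plain dataclass field with a default, a caller can redirect the cache without touching the global constant, for example by copying an existing config. A sketch, assuming pytest's tmp_path fixture:

    from copy import copy

    config = copy(precision_config)
    config.cache_dir = tmp_path  # hypothetical per-test override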
33 changes: 19 additions & 14 deletions nxontology_ml/gpt_tagger/tests/_cache_test.py
@@ -1,33 +1,38 @@
 from pathlib import Path
-from tempfile import TemporaryDirectory

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._cache import LazyLSM, _Cache
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.utils import ROOT_DIR


 def test_from_config() -> None:
-    expected_cache_path = ROOT_DIR / ".cache/precision_v1_n1.ldb"
-    cache = _Cache.from_config(precision_config)
+    config = TaskConfig(
+        name="precision",
+        prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
+        openai_model_name="gpt-4",
+        node_attributes=["efo_id", "efo_label", "efo_definition"],
+        model_n=3,
+    )
+    expected_cache_path = Path("/tmp/nxontology-ml/cache/precision_v1_n3.ldb")
+    cache = _Cache.from_config(config)
     assert isinstance(cache._storage, LazyLSM)
     assert Path(cache._storage._filename) == expected_cache_path
     assert cache._key_hash_fn == "sha1"
     assert cache._namespace == ""


-def test_main() -> None:
-    with TemporaryDirectory() as tmpdir:
-        cache_path = Path(tmpdir) / "precision_v1_n1.ldb"
-        cache = _Cache.from_config(precision_config, cache_path=cache_path)
+def test_main(precision_config: TaskConfig) -> None:
+    cache = _Cache.from_config(precision_config)

-        assert cache.get("KEY", "DEFAULT") == "DEFAULT"
-        cache["KEY"] = "value"
-        assert cache.get("KEY", "DEFAULT") == "value"
+    assert cache.get("KEY", "DEFAULT") == "DEFAULT"
+    cache["KEY"] = "value"
+    assert cache.get("KEY", "DEFAULT") == "value"

-        cache2 = _Cache.from_config(precision_config, cache_path=cache_path)
-        cache2["KEY"] = "value"
-        del cache2["KEY"]
-        assert cache2.get("KEY", "DEFAULT") == "DEFAULT"
+    cache2 = _Cache.from_config(precision_config)
+    cache2["KEY"] = "value"
+    del cache2["KEY"]
+    assert cache2.get("KEY", "DEFAULT") == "DEFAULT"


 def test_LazyLSM() -> None:
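Every test in this commit now receives precision_config as a pytest fixture argument instead of importing a module-level constant from tests/_utils. The fixture itself is defined in one of the files that did not load in this view (likely a conftest.py); a sketch of what it presumably looks like, with the tmp_path isolation inferred from test_main dropping its TemporaryDirectory:

    import pytest

    @pytest.fixture
    def precision_config(tmp_path: Path) -> TaskConfig:
        # Field values mirror those shown elsewhere in this diff; the
        # tmp_path wiring is an assumption.
        return TaskConfig(
            name="precision",
            prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
            openai_model_name="gpt-3.5-turbo",
            node_attributes=["efo_id", "efo_label", "efo_definition"],
            model_n=1,
            cache_dir=tmp_path,
        )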
nxontology_ml/gpt_tagger/tests/_chat_completion_middleware_test.py
@@ -8,6 +8,7 @@
 import pytest
 from _pytest._py.path import LocalPath

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._chat_completion_middleware import (
     _ChatCompletionMiddleware,
 )
@@ -19,7 +20,6 @@
 from nxontology_ml.gpt_tagger._utils import node_to_str_fn
 from nxontology_ml.gpt_tagger.tests._utils import (
     mk_stub_ccm,
-    precision_config,
     sanitize_json_format,
 )
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource
@@ -67,7 +67,7 @@ def test_ctor_verify() -> None:
     _mk_test_ccm(prompt_template="foo")


-def test_create(tmpdir: LocalPath) -> None:
+def test_create(tmpdir: LocalPath, precision_config: TaskConfig) -> None:
     logdir = Path(tmpdir) / "logs"
     config = copy(precision_config)
     config.model_temperature = 1
@@ -103,7 +103,7 @@ def test_create(tmpdir: LocalPath) -> None:
     assert sanitize_json_format(resp_file.read_text()) == json_resp


-def test_from_config() -> None:
+def test_from_config(precision_config: TaskConfig) -> None:
     ccm = _ChatCompletionMiddleware.from_config(precision_config)
     assert ccm._partial_payload["model"] == "gpt-3.5-turbo"
     assert ccm._partial_payload["messages"][0]["content"] == "__PLACEHOLDER__"
20 changes: 14 additions & 6 deletions nxontology_ml/gpt_tagger/tests/_features_test.py
@@ -8,10 +8,10 @@

 from nxontology_ml.data import get_efo_otar_slim
 from nxontology_ml.features import PrepareNodeFeatures
-from nxontology_ml.gpt_tagger import GptTagger
+from nxontology_ml.gpt_tagger import GptTagger, TaskConfig
 from nxontology_ml.gpt_tagger._features import DEFAULT_CONF, GptTagFeatures
 from nxontology_ml.gpt_tagger._openai_models import Response
-from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger, precision_config
+from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger
 from nxontology_ml.sklearn_transformer import NodeFeatures
 from nxontology_ml.tests.utils import read_test_resource
 from nxontology_ml.utils import ROOT_DIR
@@ -27,13 +27,19 @@ def sampled_nxo() -> NXOntology[str]:


 @pytest.fixture
-def tagger() -> GptTagger:
+def tagger(precision_config: TaskConfig) -> GptTagger:
     expected_req = read_test_resource("precision_payload.json")
     stub_resp = Response(**json.loads(read_test_resource("precision_resp.json")))  # type: ignore[misc]
-    return mk_test_gpt_tagger(stub_content={expected_req: stub_resp}, cache_content={})
+    return mk_test_gpt_tagger(
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
+    )


-def test_transform(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
+def test_transform(
+    tagger: GptTagger, sampled_nxo: NXOntology[str], precision_config: TaskConfig
+) -> None:
     p = make_pipeline(
         PrepareNodeFeatures(sampled_nxo),
         GptTagFeatures(
@@ -50,7 +56,9 @@ def test_transform(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
     assert_frame_equal(df, expected_df)


-def test_disabled(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
+def test_disabled(
+    tagger: GptTagger, sampled_nxo: NXOntology[str], precision_config: TaskConfig
+) -> None:
     p = make_pipeline(
         PrepareNodeFeatures(sampled_nxo),
         GptTagFeatures(
34 changes: 19 additions & 15 deletions nxontology_ml/gpt_tagger/tests/_gpt_tagger_test.py
@@ -9,15 +9,15 @@

 from nxontology_ml.data import get_efo_otar_slim
 from nxontology_ml.gpt_tagger._gpt_tagger import GptTagger
-from nxontology_ml.gpt_tagger._models import LabelledNode
+from nxontology_ml.gpt_tagger._models import LabelledNode, TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import Response
-from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger, precision_config
+from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource


-def test_fetch_labels() -> None:
+def test_fetch_labels(precision_config: TaskConfig) -> None:
     cache_content: dict[str, bytes] = {}
-    tagger = mk_test_gpt_tagger(cache_content)
+    tagger = mk_test_gpt_tagger(precision_config, cache_content)
     labels = tagger.fetch_labels(get_test_nodes())
     assert list(labels) == [
         LabelledNode(node_efo_id="DOID:0050890", labels=["medium"]),
@@ -41,13 +41,13 @@ def test_fetch_labels() -> None:
     }


-def test_fetch_labels_cached() -> None:
+def test_fetch_labels_cached(precision_config: TaskConfig) -> None:
     # Pre-loaded cache
     cache_content = {
         "/7665404d4f2728a09ed26b8ebf2b3be612bd7da2": b'["medium"]',
         "/962b25d69f79f600f23a17e2c3fe79948013b4de": b'["medium"]',
     }
-    tagger = mk_test_gpt_tagger(cache_content)
+    tagger = mk_test_gpt_tagger(precision_config, cache_content)
     labels = tagger.fetch_labels(get_test_nodes())
     assert list(labels) == [
         LabelledNode(node_efo_id="DOID:0050890", labels=["medium"]),
@@ -56,13 +56,13 @@
     assert tagger.get_metrics() == Counter({"Cache/get": 2, "Cache/hits": 2})


-def test_fetch_many_records() -> None:
+def test_fetch_many_records(precision_config: TaskConfig) -> None:
     # Disable caching
     class PassthroughDict(dict[str, bytes]):
         def __setitem__(self, key: str, value: bytes) -> None:
             return

-    tagger = mk_test_gpt_tagger(cache_content=PassthroughDict())
+    tagger = mk_test_gpt_tagger(precision_config, cache_content=PassthroughDict())

     def _r(n: int) -> Response:
         r = json.loads(read_test_resource("precision_resp.json"))
@@ -92,8 +92,8 @@ def _r(n: int) -> Response:
     )


-def test_get_metrics() -> None:
-    tagger = mk_test_gpt_tagger(cache_content={})
+def test_get_metrics(precision_config: TaskConfig) -> None:
+    tagger = mk_test_gpt_tagger(precision_config, cache_content={})
     tagger._counter["test"] += 42

     # Defensive copy: No effect
@@ -107,7 +107,7 @@ def test_get_metrics() -> None:
     assert tagger.get_metrics() == Counter({"test": 43})


-def test_from_config() -> None:
+def test_from_config(precision_config: TaskConfig) -> None:
     counter: Counter[str] = Counter()
     tagger = GptTagger.from_config(precision_config, counter=counter)

@@ -118,13 +118,15 @@
     assert id(tagger._cache._counter) == counter_id


-def test_resp_truncated() -> None:
+def test_resp_truncated(precision_config: TaskConfig) -> None:
     stub_resp = Response(**json.loads(read_test_resource("precision_resp.json")))  # type: ignore[misc]
     assert stub_resp["choices"][0]["finish_reason"] == "stop"
     stub_resp["choices"][0]["finish_reason"] = "length"  # Simulate resp truncation
     expected_req = read_test_resource("precision_payload.json")
     tagger = mk_test_gpt_tagger(
-        stub_content={expected_req: stub_resp}, cache_content={}
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
     )
     with pytest.raises(
         ValueError,
@@ -142,11 +144,13 @@ def _assert_user_warning_starts_with(warn: WarningMessage, s: str) -> None:
     assert warn_msg.startswith(s)


-def test_resp_id_mismatch() -> None:
+def test_resp_id_mismatch(precision_config: TaskConfig) -> None:
     expected_req = read_test_resource("mismatch_payload.json")
     stub_resp = Response(**json.loads(read_test_resource("mismatch_resp.json")))  # type: ignore[misc]
     tagger = mk_test_gpt_tagger(
-        stub_content={expected_req: stub_resp}, cache_content={}
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
     )
     nxo = get_efo_otar_slim()
     valid_resp_node = "DOID:0050890"
3 changes: 1 addition & 2 deletions nxontology_ml/gpt_tagger/tests/_integration_test.py
@@ -12,13 +12,12 @@

 from nxontology_ml.gpt_tagger._models import TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import Response
 from nxontology_ml.gpt_tagger._utils import node_to_str_fn
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource
 from nxontology_ml.utils import ROOT_DIR


 @pytest.mark.skip(reason="IT: Makes a real openai api call")
-def test_chat_completion_precision_it() -> None:
+def test_chat_completion_precision_it(precision_config: TaskConfig) -> None:
     # NOTE: Flaky API response, even with temp=0 :(
     # NOTE: Needs an OPENAI_API_KEY setup, see main README.md
     ccm = _ChatCompletionMiddleware.from_config(precision_config)
6 changes: 4 additions & 2 deletions nxontology_ml/gpt_tagger/tests/_tiktoken_batcher_test.py
@@ -6,9 +6,9 @@
 import tiktoken
 from tiktoken import Encoding

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import _4K
 from nxontology_ml.gpt_tagger._tiktoken_batcher import _TiktokenBatcher
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.tests.utils import get_test_resource_path
@@ -54,7 +54,9 @@ def test_add_tokens(tiktoken_cl100k_encoding: Encoding) -> None:
     batcher._do_add_record_to_buffer(record)


-def test_from_config(tiktoken_cl100k_encoding: Encoding) -> None:
+def test_from_config(
+    tiktoken_cl100k_encoding: Encoding, precision_config: TaskConfig
+) -> None:
     # Valid config
     batcher = _TiktokenBatcher.from_config(precision_config)
     assert batcher._tiktoken_encoding == tiktoken_cl100k_encoding
[Diff truncated: 13 of the 25 changed files loaded above; the remaining files did not load in this view.]