Move default cache location to /tmp/nxontology-ml/cache
yonromai committed Oct 20, 2023
1 parent 1b59233 commit 834a3fa
Showing 25 changed files with 150 additions and 124 deletions.
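The new default location referenced throughout this commit comes from a CACHE_DIR constant in nxontology_ml/utils.py, a file whose diff did not load in this view. A minimal sketch of the constant, assuming it lives next to the existing ROOT_DIR; the path itself is confirmed by the assertion in _cache_test.py below:

    # nxontology_ml/utils.py (sketch; ROOT_DIR's exact expression is assumed)
    from pathlib import Path

    ROOT_DIR = Path(__file__).parent.parent
    CACHE_DIR = Path("/tmp/nxontology-ml/cache")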
2 changes: 2 additions & 0 deletions experimentation/embeddings_warmup.py
@@ -5,6 +5,7 @@
     DEFAULT_EMBEDDING_MODEL,
     AutoModelEmbeddings,
 )
+from nxontology_ml.utils import CACHE_DIR
@@ -14,6 +15,7 @@ def warmup_cache(
     # Warm up the embedding cache
     ame = ame or AutoModelEmbeddings.from_pretrained(
         DEFAULT_EMBEDDING_MODEL,
+        cache_dir=CACHE_DIR,
     )
     nxo = get_efo_otar_slim()
     X, _ = read_training_data(take=take)
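The call sites above and in the tests below imply that AutoModelEmbeddings.from_pretrained now takes a cache_dir directory instead of the old cache_path file argument. A usage sketch (the embeddings module path is assumed, since the import statement sits outside the hunk):

    from nxontology_ml.text_embeddings.embeddings_model import (  # module path assumed
        DEFAULT_EMBEDDING_MODEL,
        AutoModelEmbeddings,
    )
    from nxontology_ml.utils import CACHE_DIR

    # Embeddings are cached under CACHE_DIR instead of a hard-coded .ldb
    # file inside the repository tree.
    ame = AutoModelEmbeddings.from_pretrained(DEFAULT_EMBEDDING_MODEL, cache_dir=CACHE_DIR)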
7 changes: 1 addition & 6 deletions experimentation/model_runner.py
@@ -139,12 +139,7 @@ def run_experiments(
             SubsetsFeatures(enabled=exp.subsets_enabled),
             TherapeuticAreaFeatures(enabled=exp.ta_enabled),
             GptTagFeatures.from_config(exp.gpt_tagger_config),
-            TextEmbeddingsTransformer.from_config(
-                enabled=exp.embedding_enabled,
-                pca_components=exp.pca_components,
-                use_lda=exp.use_lda,
-                embedding_model=ame,
-            ),
+            TextEmbeddingsTransformer.from_config(conf=exp, embedding_model=ame),
             CatBoostDataFormatter(),
         )
         mmb.steps_from_pipeline(feature_pipeline)
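The runner now hands the whole experiment config to the transformer rather than unpacking individual flags at every call site. The updated from_config is not among the loaded diffs; a hypothetical sketch, assuming it reads the same ModelConfig fields that were previously passed one by one:

    @classmethod
    def from_config(
        cls,
        conf: ModelConfig,
        embedding_model: AutoModelEmbeddings | None = None,
    ) -> "TextEmbeddingsTransformer":
        # Hypothetical body: the flag-to-field mapping now lives in one
        # place, so new embedding options only need handling here.
        return cls(
            enabled=conf.embedding_enabled,
            pca_components=conf.pca_components,
            use_lda=conf.use_lda,
            embedding_model=embedding_model,
        )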
3 changes: 1 addition & 2 deletions experimentation/tests/embeddings_warmup_test.py
@@ -9,8 +9,7 @@
 def test_warmup_cache() -> None:
     ame = AutoModelEmbeddings.from_pretrained(
         DEFAULT_EMBEDDING_MODEL,
-        cache_path=ROOT_DIR
-        / "nxontology_ml/text_embeddings/tests/test_resources/embeddings_cache.ldb",
+        cache_dir=ROOT_DIR / "nxontology_ml/text_embeddings/tests/test_resources",
     )
     warmup_cache(ame=ame, take=10)
     assert dict(ame._counter) == {"AutoModelEmbeddings/CACHE_HIT": 10}
6 changes: 4 additions & 2 deletions experimentation/tests/gpt_tags_warmup_test.py
@@ -1,12 +1,14 @@
 from experimentation.gpt_tags_warmup import warmup_gpt_tags
+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger


-def test_warmup_gpt_tags() -> None:
+def test_warmup_gpt_tags(precision_config: TaskConfig) -> None:
     tagger = mk_test_gpt_tagger(
+        config=precision_config,
         cache_content={
             "/a93f3eabc24f867ae4f1d6b371ba6734e38ea0a4": b'["medium"]',
-        }
+        },
     )
     warmup_gpt_tags(
         tagger=tagger,
4 changes: 3 additions & 1 deletion experimentation/tests/model_runner_test.py
@@ -9,7 +9,9 @@


 def test_run_experiments(tmp_path: Path) -> None:
-    ame = AutoModelEmbeddings.from_pretrained(DEFAULT_EMBEDDING_MODEL)
+    ame = AutoModelEmbeddings.from_pretrained(
+        DEFAULT_EMBEDDING_MODEL, cache_dir=tmp_path
+    )
     experiments = [
         ModelConfig(
             eval_metric="BiasedMaeMetric",
7 changes: 2 additions & 5 deletions nxontology_ml/gpt_tagger/_cache.py
@@ -10,7 +10,6 @@

 from nxontology_ml.gpt_tagger._models import TaskConfig
 from nxontology_ml.gpt_tagger._utils import config_to_cache_namespace, counter_or_empty
-from nxontology_ml.utils import ROOT_DIR


 class _Cache:
@@ -70,12 +69,10 @@ def from_config(
         cls,
         config: TaskConfig,
         counter: Counter[str] | None = None,
-        cache_path: Path | None = None,
     ) -> "_Cache":
         cache_namespace = config_to_cache_namespace(config)
-        if not cache_path:
-            cache_path = ROOT_DIR / f".cache/{cache_namespace}.ldb"
-        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        config.cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_path = config.cache_dir / f"{cache_namespace}.ldb"
         return cls(
             storage=LazyLSM(cache_path.as_posix()),
             namespace="",  # Namespace is already part of the storage path
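Net effect: the cache file name is still derived from the config's namespace, but the directory now comes from config.cache_dir instead of a hard-coded ROOT_DIR/.cache. An illustration, with values taken from the updated test below:

    config = TaskConfig(
        name="precision",
        prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
        openai_model_name="gpt-4",
        node_attributes=["efo_id", "efo_label", "efo_definition"],
        model_n=3,
    )
    cache = _Cache.from_config(config)
    # Old default path: <repo>/.cache/precision_v1_n3.ldb
    # New default path: /tmp/nxontology-ml/cache/precision_v1_n3.ldb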
4 changes: 3 additions & 1 deletion nxontology_ml/gpt_tagger/_models.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from pathlib import Path

-from nxontology_ml.utils import ROOT_DIR
+from nxontology_ml.utils import CACHE_DIR, ROOT_DIR

 LOG_DIR = ROOT_DIR / "logs/openai-api"

@@ -70,6 +70,8 @@ class TaskConfig:
     # Optionally persist logs to disk
     logs_path: Path | None = LOG_DIR

+    cache_dir: Path = CACHE_DIR
+

 @dataclass
 class LabelledNode:
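Because cache_dir is a plain dataclass field with a default, a caller can redirect the cache without touching the global constant, for example by copying an existing config. A sketch, assuming pytest's tmp_path fixture:

    from copy import copy

    config = copy(precision_config)
    config.cache_dir = tmp_path  # hypothetical per-test override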
33 changes: 19 additions & 14 deletions nxontology_ml/gpt_tagger/tests/_cache_test.py
@@ -1,33 +1,38 @@
 from pathlib import Path
-from tempfile import TemporaryDirectory

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._cache import LazyLSM, _Cache
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.utils import ROOT_DIR


 def test_from_config() -> None:
-    expected_cache_path = ROOT_DIR / ".cache/precision_v1_n1.ldb"
-    cache = _Cache.from_config(precision_config)
+    config = TaskConfig(
+        name="precision",
+        prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
+        openai_model_name="gpt-4",
+        node_attributes=["efo_id", "efo_label", "efo_definition"],
+        model_n=3,
+    )
+    expected_cache_path = Path("/tmp/nxontology-ml/cache/precision_v1_n3.ldb")
+    cache = _Cache.from_config(config)
     assert isinstance(cache._storage, LazyLSM)
     assert Path(cache._storage._filename) == expected_cache_path
     assert cache._key_hash_fn == "sha1"
     assert cache._namespace == ""


-def test_main() -> None:
-    with TemporaryDirectory() as tmpdir:
-        cache_path = Path(tmpdir) / "precision_v1_n1.ldb"
-        cache = _Cache.from_config(precision_config, cache_path=cache_path)
+def test_main(precision_config: TaskConfig) -> None:
+    cache = _Cache.from_config(precision_config)

-        assert cache.get("KEY", "DEFAULT") == "DEFAULT"
-        cache["KEY"] = "value"
-        assert cache.get("KEY", "DEFAULT") == "value"
+    assert cache.get("KEY", "DEFAULT") == "DEFAULT"
+    cache["KEY"] = "value"
+    assert cache.get("KEY", "DEFAULT") == "value"

-        cache2 = _Cache.from_config(precision_config, cache_path=cache_path)
-        cache2["KEY"] = "value"
-        del cache2["KEY"]
-        assert cache2.get("KEY", "DEFAULT") == "DEFAULT"
+    cache2 = _Cache.from_config(precision_config)
+    cache2["KEY"] = "value"
+    del cache2["KEY"]
+    assert cache2.get("KEY", "DEFAULT") == "DEFAULT"


 def test_LazyLSM() -> None:
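Every test in this commit now receives precision_config as a pytest fixture argument instead of importing a module-level constant from tests/_utils. The fixture itself is defined in one of the files that did not load in this view (likely a conftest.py); a sketch of what it presumably looks like, with the tmp_path isolation inferred from test_main dropping its TemporaryDirectory:

    import pytest

    @pytest.fixture
    def precision_config(tmp_path: Path) -> TaskConfig:
        # Field values mirror those shown elsewhere in this diff; the
        # tmp_path wiring is an assumption.
        return TaskConfig(
            name="precision",
            prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
            openai_model_name="gpt-3.5-turbo",
            node_attributes=["efo_id", "efo_label", "efo_definition"],
            model_n=1,
            cache_dir=tmp_path,
        )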
nxontology_ml/gpt_tagger/tests/_chat_completion_middleware_test.py
@@ -8,6 +8,7 @@
 import pytest
 from _pytest._py.path import LocalPath

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._chat_completion_middleware import (
     _ChatCompletionMiddleware,
 )
@@ -19,7 +20,6 @@
 from nxontology_ml.gpt_tagger._utils import node_to_str_fn
 from nxontology_ml.gpt_tagger.tests._utils import (
     mk_stub_ccm,
-    precision_config,
     sanitize_json_format,
 )
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource
@@ -67,7 +67,7 @@ def test_ctor_verify() -> None:
     _mk_test_ccm(prompt_template="foo")


-def test_create(tmpdir: LocalPath) -> None:
+def test_create(tmpdir: LocalPath, precision_config: TaskConfig) -> None:
     logdir = Path(tmpdir) / "logs"
     config = copy(precision_config)
     config.model_temperature = 1
@@ -103,7 +103,7 @@ def test_create(tmpdir: LocalPath) -> None:
     assert sanitize_json_format(resp_file.read_text()) == json_resp


-def test_from_config() -> None:
+def test_from_config(precision_config: TaskConfig) -> None:
     ccm = _ChatCompletionMiddleware.from_config(precision_config)
     assert ccm._partial_payload["model"] == "gpt-3.5-turbo"
     assert ccm._partial_payload["messages"][0]["content"] == "__PLACEHOLDER__"
20 changes: 14 additions & 6 deletions nxontology_ml/gpt_tagger/tests/_features_test.py
@@ -8,10 +8,10 @@

 from nxontology_ml.data import get_efo_otar_slim
 from nxontology_ml.features import PrepareNodeFeatures
-from nxontology_ml.gpt_tagger import GptTagger
+from nxontology_ml.gpt_tagger import GptTagger, TaskConfig
 from nxontology_ml.gpt_tagger._features import DEFAULT_CONF, GptTagFeatures
 from nxontology_ml.gpt_tagger._openai_models import Response
-from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger, precision_config
+from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger
 from nxontology_ml.sklearn_transformer import NodeFeatures
 from nxontology_ml.tests.utils import read_test_resource
 from nxontology_ml.utils import ROOT_DIR
@@ -27,13 +27,19 @@ def sampled_nxo() -> NXOntology[str]:


 @pytest.fixture
-def tagger() -> GptTagger:
+def tagger(precision_config: TaskConfig) -> GptTagger:
     expected_req = read_test_resource("precision_payload.json")
     stub_resp = Response(**json.loads(read_test_resource("precision_resp.json")))  # type: ignore[misc]
-    return mk_test_gpt_tagger(stub_content={expected_req: stub_resp}, cache_content={})
+    return mk_test_gpt_tagger(
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
+    )


-def test_transform(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
+def test_transform(
+    tagger: GptTagger, sampled_nxo: NXOntology[str], precision_config: TaskConfig
+) -> None:
     p = make_pipeline(
         PrepareNodeFeatures(sampled_nxo),
         GptTagFeatures(
@@ -50,7 +56,9 @@ def test_transform(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
     assert_frame_equal(df, expected_df)


-def test_disabled(tagger: GptTagger, sampled_nxo: NXOntology[str]) -> None:
+def test_disabled(
+    tagger: GptTagger, sampled_nxo: NXOntology[str], precision_config: TaskConfig
+) -> None:
     p = make_pipeline(
         PrepareNodeFeatures(sampled_nxo),
         GptTagFeatures(
34 changes: 19 additions & 15 deletions nxontology_ml/gpt_tagger/tests/_gpt_tagger_test.py
@@ -9,15 +9,15 @@

 from nxontology_ml.data import get_efo_otar_slim
 from nxontology_ml.gpt_tagger._gpt_tagger import GptTagger
-from nxontology_ml.gpt_tagger._models import LabelledNode
+from nxontology_ml.gpt_tagger._models import LabelledNode, TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import Response
-from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger, precision_config
+from nxontology_ml.gpt_tagger.tests._utils import mk_test_gpt_tagger
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource


-def test_fetch_labels() -> None:
+def test_fetch_labels(precision_config: TaskConfig) -> None:
     cache_content: dict[str, bytes] = {}
-    tagger = mk_test_gpt_tagger(cache_content)
+    tagger = mk_test_gpt_tagger(precision_config, cache_content)
     labels = tagger.fetch_labels(get_test_nodes())
     assert list(labels) == [
         LabelledNode(node_efo_id="DOID:0050890", labels=["medium"]),
@@ -41,13 +41,13 @@ def test_fetch_labels() -> None:
     }


-def test_fetch_labels_cached() -> None:
+def test_fetch_labels_cached(precision_config: TaskConfig) -> None:
     # Pre-loaded cache
     cache_content = {
         "/7665404d4f2728a09ed26b8ebf2b3be612bd7da2": b'["medium"]',
         "/962b25d69f79f600f23a17e2c3fe79948013b4de": b'["medium"]',
     }
-    tagger = mk_test_gpt_tagger(cache_content)
+    tagger = mk_test_gpt_tagger(precision_config, cache_content)
     labels = tagger.fetch_labels(get_test_nodes())
     assert list(labels) == [
         LabelledNode(node_efo_id="DOID:0050890", labels=["medium"]),
@@ -56,13 +56,13 @@
     assert tagger.get_metrics() == Counter({"Cache/get": 2, "Cache/hits": 2})


-def test_fetch_many_records() -> None:
+def test_fetch_many_records(precision_config: TaskConfig) -> None:
     # Disable caching
     class PassthroughDict(dict[str, bytes]):
         def __setitem__(self, key: str, value: bytes) -> None:
             return

-    tagger = mk_test_gpt_tagger(cache_content=PassthroughDict())
+    tagger = mk_test_gpt_tagger(precision_config, cache_content=PassthroughDict())

     def _r(n: int) -> Response:
         r = json.loads(read_test_resource("precision_resp.json"))
@@ -92,8 +92,8 @@ def _r(n: int) -> Response:
     )


-def test_get_metrics() -> None:
-    tagger = mk_test_gpt_tagger(cache_content={})
+def test_get_metrics(precision_config: TaskConfig) -> None:
+    tagger = mk_test_gpt_tagger(precision_config, cache_content={})
     tagger._counter["test"] += 42

     # Defensive copy: No effect
@@ -107,7 +107,7 @@ def test_get_metrics() -> None:
     assert tagger.get_metrics() == Counter({"test": 43})


-def test_from_config() -> None:
+def test_from_config(precision_config: TaskConfig) -> None:
     counter: Counter[str] = Counter()
     tagger = GptTagger.from_config(precision_config, counter=counter)

@@ -118,13 +118,15 @@
     assert id(tagger._cache._counter) == counter_id


-def test_resp_truncated() -> None:
+def test_resp_truncated(precision_config: TaskConfig) -> None:
     stub_resp = Response(**json.loads(read_test_resource("precision_resp.json")))  # type: ignore[misc]
     assert stub_resp["choices"][0]["finish_reason"] == "stop"
     stub_resp["choices"][0]["finish_reason"] = "length"  # Simulate resp truncation
     expected_req = read_test_resource("precision_payload.json")
     tagger = mk_test_gpt_tagger(
-        stub_content={expected_req: stub_resp}, cache_content={}
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
     )
     with pytest.raises(
         ValueError,
@@ -142,11 +144,13 @@ def _assert_user_warning_starts_with(warn: WarningMessage, s: str) -> None:
     assert warn_msg.startswith(s)


-def test_resp_id_mismatch() -> None:
+def test_resp_id_mismatch(precision_config: TaskConfig) -> None:
     expected_req = read_test_resource("mismatch_payload.json")
     stub_resp = Response(**json.loads(read_test_resource("mismatch_resp.json")))  # type: ignore[misc]
     tagger = mk_test_gpt_tagger(
-        stub_content={expected_req: stub_resp}, cache_content={}
+        config=precision_config,
+        stub_content={expected_req: stub_resp},
+        cache_content={},
     )
     nxo = get_efo_otar_slim()
     valid_resp_node = "DOID:0050890"
3 changes: 1 addition & 2 deletions nxontology_ml/gpt_tagger/tests/_integration_test.py
@@ -12,13 +12,12 @@

 from nxontology_ml.gpt_tagger._models import TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import Response
 from nxontology_ml.gpt_tagger._utils import node_to_str_fn
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.tests.utils import get_test_nodes, read_test_resource
 from nxontology_ml.utils import ROOT_DIR


 @pytest.mark.skip(reason="IT: Makes a real openai api call")
-def test_chat_completion_precision_it() -> None:
+def test_chat_completion_precision_it(precision_config: TaskConfig) -> None:
     # NOTE: Flaky API response, even with temp=0 :(
     # NOTE: Needs an OPENAI_API_KEY setup, see main README.md
     ccm = _ChatCompletionMiddleware.from_config(precision_config)
6 changes: 4 additions & 2 deletions nxontology_ml/gpt_tagger/tests/_tiktoken_batcher_test.py
@@ -6,9 +6,9 @@
 import tiktoken
 from tiktoken import Encoding

+from nxontology_ml.gpt_tagger import TaskConfig
 from nxontology_ml.gpt_tagger._openai_models import _4K
 from nxontology_ml.gpt_tagger._tiktoken_batcher import _TiktokenBatcher
-from nxontology_ml.gpt_tagger.tests._utils import precision_config
 from nxontology_ml.tests.utils import get_test_resource_path
@@ -54,7 +54,9 @@ def test_add_tokens(tiktoken_cl100k_encoding: Encoding) -> None:
     batcher._do_add_record_to_buffer(record)


-def test_from_config(tiktoken_cl100k_encoding: Encoding) -> None:
+def test_from_config(
+    tiktoken_cl100k_encoding: Encoding, precision_config: TaskConfig
+) -> None:
     # Valid config
     batcher = _TiktokenBatcher.from_config(precision_config)
     assert batcher._tiktoken_encoding == tiktoken_cl100k_encoding
[Diff truncated: 13 of the 25 changed files loaded above; the remaining files did not load in this view.]