valory-xyz · 0xArdi · Aug 7, 2024 · Apr 24, 2024 · Apr 29, 2024 · May 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -54,4 +54,6 @@ backup_mech/
 /packages/valory/skills/termination_abci/
 /pip
 /tool_test.py
-.venv
+.venv
+log
+.benchmark-cache
diff --git a/packages/kongzii/customs/ofv_market_resolver/benchmark.py b/packages/kongzii/customs/ofv_market_resolver/benchmark.py
@@ -0,0 +1,133 @@
+import typer
+import pandas as pd
+from packages.kongzii.customs.ofv_market_resolver.ofv_market_resolver import (
+    run as ofv_run,
+)
+from packages.napthaai.customs.resolve_market_reasoning.resolve_market_reasoning import (
+    Results,
+    run as original_run,
+)
+from pydantic import SecretStr, ValidationError
+from joblib import Memory
+
+# Typer application on which the benchmark commands of this file are registered.
+APP = typer.Typer()
+
+# On-disk joblib cache, so questions that were already resolved are not re-run.
+MEMORY = Memory(location=".benchmark-cache", verbose=0)
+# Cached wrapper around the OFV resolver's `run` function.
+ofv_run_cached = MEMORY.cache(ofv_run)
+
+
+@MEMORY.cache
+def run_original_resolver_cached(
+    question: str,
+    openai_api_key: SecretStr,
+    google_api_key: SecretStr,
+    google_engine_id: SecretStr,
+) -> bool | None:
+    """
+    Resolve `question` with the original `resolve-market-reasoning-gpt-4` tool.
+
+    The function is wrapped in `MEMORY.cache`, so its results go through the
+    file cache.
+
+    Returns:
+        The `has_occurred` value of the parsed `Results`, or None when a
+        `ValueError` is raised while running the tool or parsing its output.
+    """
+    # The original tool takes plain strings, so unwrap the secrets up front.
+    plain_keys = {
+        "openai": openai_api_key.get_secret_value(),
+        "google_api_key": google_api_key.get_secret_value(),
+        "google_engine_id": google_engine_id.get_secret_value(),
+    }
+    try:
+        # Only the first element of the tool's return value is the JSON dump.
+        raw_output = original_run(
+            api_keys=plain_keys,
+            tool="resolve-market-reasoning-gpt-4",
+            prompt=question,
+        )[0]
+        parsed = Results.model_validate_json(raw_output)
+        return parsed.has_occurred
+    except ValueError:
+        return None
+
+
+@APP.command()
+def full(
+    data_path: str,
+    openai_api_key: str,
+    serper_api_key: str,
+    google_api_key: str,
+    google_engine_id: str,
+) -> None:
+    """
+    Will run the prediction market resolver on all provided data and compare the results.
+
+    Expects a tsv file with columns:
+        - question
+        - resolution (YES/NO, as currently resolved on Omen)
+        - my_resolution (YES/NO, as resolved manually by you, used as ground truth)
+
+    Writes `markets_resolved.tsv` (all rows, with both new resolutions added) and
+    `markets_resolved_incorretly_by_ofv.tsv` (rows where OFV disagrees with
+    `my_resolution`), then prints the accuracy of each resolution column.
+
+    Example command:
+
+    ```
+    python packages/kongzii/customs/ofv_market_resolver/benchmark.py full markets.tsv {openai api key} {serper api key} {google api key} {google engine id}
+    ```
+    """
+    df = pd.read_csv(data_path, sep="\t")
+
+    # Wrap the raw keys once, instead of once per row inside the lambdas.
+    openai_secret = SecretStr(openai_api_key)
+    serper_secret = SecretStr(serper_api_key)
+    google_secret = SecretStr(google_api_key)
+    engine_secret = SecretStr(google_engine_id)
+
+    def to_label(resolved: bool | None) -> str:
+        """Normalise a resolver output to "YES", "NO" or "None"."""
+        if resolved is None:
+            return "None"
+        return "YES" if resolved else "NO"
+
+    # Run the resolution on all the data.
+    df["ofv_resolution"] = df["question"].apply(
+        lambda q: ofv_run_cached(
+            q,
+            openai_api_key=openai_secret,
+            serper_api_key=serper_secret,
+        )
+    )
+    df["new_original_resolution"] = df["question"].apply(
+        lambda q: run_original_resolver_cached(
+            q,
+            openai_api_key=openai_secret,
+            google_api_key=google_secret,
+            google_engine_id=engine_secret,
+        )
+    )
+    # Normalise boolean to YES/NO/None, so the columns compare directly with `my_resolution`.
+    df["ofv_resolution"] = df["ofv_resolution"].apply(to_label)
+    df["new_original_resolution"] = df["new_original_resolution"].apply(to_label)
+    # Save all the predictions and, separately, those that OFV resolved incorrectly.
+    df.to_csv("markets_resolved.tsv", sep="\t", index=False)
+    df[df["ofv_resolution"] != df["my_resolution"]].to_csv(
+        "markets_resolved_incorretly_by_ofv.tsv", sep="\t", index=False
+    )
+
+    # Calculate the accuracy as the share of rows that match the ground truth.
+    # `Series.mean()` gives NaN for an empty file instead of raising ZeroDivisionError.
+    ground_truth = df["my_resolution"]
+    accuracy_current = (df["resolution"] == ground_truth).mean()
+    accuracy_new_original = (df["new_original_resolution"] == ground_truth).mean()
+    accuracy_ofv = (df["ofv_resolution"] == ground_truth).mean()
+    print(
+        f"""
+Current accuracy: {accuracy_current * 100:.2f}%
+Original's new run accuracy: {accuracy_new_original * 100:.2f}%
+OFV's accuracy: {accuracy_ofv * 100:.2f}%
+"""
+    )
+
+
+@APP.command()
+def single(
+    question: str,
+    openai_api_key: str,
+    serper_api_key: str,
+) -> None:
+    """
+    Will run the prediction market resolver and print the result on a single question.
+
+    Example command:
+
+    ```
+    python packages/kongzii/customs/ofv_market_resolver/benchmark.py single "Will McDonald's successfully buy back all its Israeli restaurants by 12 April 2024?" {openai api key} {serper api key}
+    ```
+    """
+    # The resolver takes its keys as `SecretStr`, so wrap the plain strings first.
+    wrapped_keys = {
+        "openai_api_key": SecretStr(openai_api_key),
+        "serper_api_key": SecretStr(serper_api_key),
+    }
+    # Uncached call; the return value is discarded here.
+    ofv_run(question, **wrapped_keys)
+
+
+# Run the Typer application only when this file is executed as a script.
+if __name__ == "__main__":
+    APP()
diff --git a/packages/kongzii/customs/ofv_market_resolver/ofv_market_resolver.py b/packages/kongzii/customs/ofv_market_resolver/ofv_market_resolver.py
@@ -0,0 +1,178 @@
+from factcheck import FactCheck
+from factcheck.utils.multimodal import modal_normalization
+from langchain_openai import ChatOpenAI
+from typing import Annotated
+from pydantic import SecretStr, BaseModel, BeforeValidator
+
+# OpenAI model used whenever a caller does not pass `model` explicitly.
+DEFAULT_OPENAI_MODEL = "gpt-4-0125-preview"
+
+
+def _undecided_to_none(value):
+    """Map "Nothing to check." and "non-factual" to None; pass anything else through."""
+    undecided_markers = ("Nothing to check.", "non-factual")
+    if value in undecided_markers:
+        return None
+    return value
+
+
+# Optional bool verdict; `_undecided_to_none` is attached as a pydantic `BeforeValidator`.
+Factuality = Annotated[bool | None, BeforeValidator(_undecided_to_none)]
+
+
+# Schema of one entry of `FactCheckResult.claims_details`.
+# NOTE(review): the per-field remarks marked "presumably" are inferred from the
+# field names only; confirm them against the output of the `factcheck` package.
+class FactCheckClaimDetails(BaseModel):
+    claim: str  # presumably the text of the individual claim that was checked
+    factuality: Factuality  # bool, or None for the marker strings handled by `Factuality`
+    correction: str | None  # presumably a corrected statement, when one is available
+    reference_url: str  # presumably the source the verdict is based on
+
+
+# Schema that `factcheck` validates the return value of `FactCheck.check_response` against.
+class FactCheckResult(BaseModel):
+    factuality: Factuality  # the value that `run` hands back to its caller
+    claims_details: list[FactCheckClaimDetails] | None  # interpolated into the message printed by `run`
+
+
+def factcheck(
+    statement: str,
+    model: str = DEFAULT_OPENAI_MODEL,
+    openai_api_key: SecretStr | None = None,
+    serper_api_key: SecretStr | None = None,
+) -> FactCheckResult:
+    """
+    Fact-check `statement` with `factcheck.FactCheck`, using the "serper" retriever.
+
+    Args:
+        statement: Text to be checked.
+        model: Name passed to `FactCheck` as `default_model`.
+        openai_api_key: OpenAI key; required despite the `None` default.
+        serper_api_key: Serper key; required despite the `None` default.
+
+    Returns:
+        The output of `FactCheck.check_response`, validated as `FactCheckResult`.
+
+    Raises:
+        ValueError: If either API key is missing.
+    """
+    # Fail with a clear message instead of an AttributeError on `None.get_secret_value()`.
+    if openai_api_key is None or serper_api_key is None:
+        raise ValueError(
+            "Both `openai_api_key` and `serper_api_key` must be provided."
+        )
+
+    # Named `checker` so that the local does not shadow this function's own name.
+    checker = FactCheck(
+        default_model=model,
+        api_config={
+            "OPENAI_API_KEY": openai_api_key.get_secret_value(),
+            "SERPER_API_KEY": serper_api_key.get_secret_value(),
+        },
+        retriever="serper",
+        num_seed_retries=5,
+    )
+    content = modal_normalization("string", statement)
+    res = checker.check_response(content)
+
+    return FactCheckResult.model_validate(res)
+
+
+def rewrite_as_sentence(
+    question: str,
+    model: str = DEFAULT_OPENAI_MODEL,
+    openai_api_key: SecretStr | None = None,
+) -> str:
+    """
+    Rewrites the question into a sentence, example:
+
+    `Will former Trump Organization CFO Allen Weisselberg be sentenced to jail by 15 April 2024?`
+    ->
+    `Former Trump Organization CFO Allen Weisselberg was sentenced to jail by 15 April 2024.`
+
+    Args:
+        question: The market question to rewrite.
+        model: OpenAI chat model to use.
+        openai_api_key: OpenAI key; required despite the `None` default.
+
+    Returns:
+        The content of the model's reply, converted to `str`.
+
+    Raises:
+        ValueError: If `openai_api_key` is missing.
+    """
+    # Fail with a clear message instead of an AttributeError on `None.get_secret_value()`.
+    if openai_api_key is None:
+        raise ValueError("`openai_api_key` must be provided.")
+
+    llm = ChatOpenAI(
+        model=model, temperature=0.0, api_key=openai_api_key.get_secret_value()
+    )
+
+    # The prompt text is sent to the model verbatim, so it is left untouched.
+    prompt = f"""
+Rewrite the question into a simple annoucment sentence stating a fact or prediction like it is already known.  
+Make future tense into past tense.
+For future questions that ask if something will happen "by" some date, rewrite it to "before" that date or any time sooner.
+For future questions that ask if something will happen "on" some date, rewrite it to "on" that date.
+If the question is both "on" and "by" some date, rewrite it as "before or any time sooner than" that date.
+If the question is about exact date, keep it exact. 
+If the question is about a date range, keep it a range.
+Always keep the same meaning.                          
+Never negate the sentence into opposite meaning of the question.                  
+
+Question: {question}
+Sentence:                                         
+"""
+    return str(llm.invoke(prompt, max_tokens=512).content)
+
+
+# TODO: This could be imported from prediction-market-agent-tooling, but given the conflict in the langchain versions,
+# it would require changes in other mechs of this repository.
+def is_predictable_binary(
+    question: str,
+    model: str = DEFAULT_OPENAI_MODEL,
+    openai_api_key: SecretStr | None = None,
+) -> bool:
+    """
+    Evaluate if the question is actually answerable.
+
+    Args:
+        question: The market question to evaluate.
+        model: OpenAI chat model to use.
+        openai_api_key: OpenAI key; required despite the `None` default.
+
+    Returns:
+        True when the text after the last "decision" in the model's reply
+        contains "yes", False when it contains "no" (and no "yes").
+
+    Raises:
+        ValueError: If `openai_api_key` is missing, or if the reply has no
+            "decision" part or that part contains neither "yes" nor "no".
+    """
+    # Fail with a clear message instead of an AttributeError on `None.get_secret_value()`.
+    if openai_api_key is None:
+        raise ValueError("`openai_api_key` must be provided.")
+
+    llm = ChatOpenAI(
+        model=model, temperature=0.0, api_key=openai_api_key.get_secret_value()
+    )
+
+    prompt = f"""Main signs about a fully qualified question (sometimes referred to as a "market"):
+- The market's question needs to be specific, without use of pronouns.
+- The market's question needs to have a clear future event.
+- The market's question needs to have a clear time frame.
+- The event in the market's question doesn't have to be ultra-specific, it will be decided by a crowd later on.
+- If the market's question contains date, but without an year, it's okay.
+- If the market's question contains year, but without an exact date, it's okay.
+- The market's question can not be about itself or refer to itself.
+- The answer is probably Google-able, after the event happened.
+- The potential asnwer can be only "Yes" or "No".
+
+Follow a chain of thought to evaluate if the question is fully qualified:
+
+First, write the parts of the following question:
+
+"{question}"
+
+Then, write down what is the future event of the question, what it refers to and when that event will happen if the question contains it.
+
+Then, explain why do you think it is or isn't fully qualified.
+
+Finally, write your final decision, write `decision: ` followed by either "yes it is fully qualified" or "no it isn't fully qualified" about the question. Don't write anything else after that. You must include "yes" or "no".
+"""
+    completion = str(llm.invoke(prompt, max_tokens=512).content)
+
+    # Only the text after the LAST "decision" counts; the chain of thought
+    # before it may mention "yes"/"no" as well.
+    _, marker, decision = completion.lower().rpartition("decision")
+    if marker:
+        # "yes" is tested first, as in the prompt's two allowed answers.
+        if "yes" in decision:
+            return True
+        if "no" in decision:
+            return False
+
+    raise ValueError(
+        f"Invalid completion in is_predictable for `{question}`: {completion}"
+    )
+
+
+def run(
+    market_question: str,
+    openai_api_key: SecretStr | None = None,
+    serper_api_key: SecretStr | None = None,
+) -> bool | None:
+    """
+    Resolve a prediction market question with the Open Fact Verifier pipeline.
+
+    The question is first screened by `is_predictable_binary`, then turned into
+    a statement by `rewrite_as_sentence`, and that statement is passed to
+    `factcheck`. Progress messages are printed along the way.
+
+    Returns:
+        - None if can't decide
+        - True if the answer for the question is "Yes"
+        - False if the answer for the question is "No"
+    """
+    # Guard clause: do not spend a fact check on a question that is not answerable.
+    if not is_predictable_binary(market_question, openai_api_key=openai_api_key):
+        print(
+            f"Question `{market_question}` is not answerable, skipping fact checking."
+        )
+        return None
+
+    # Turn the question (about the future) into a statement (about the past).
+    statement = rewrite_as_sentence(market_question, openai_api_key=openai_api_key)
+    print(f"Question `{market_question}` rewritten into `{statement}`.")
+
+    # Fact-check the statement and report the verdict with its supporting details.
+    outcome = factcheck(
+        statement,
+        openai_api_key=openai_api_key,
+        serper_api_key=serper_api_key,
+    )
+    verdict = outcome.factuality
+    print(
+        f"Fact check result for `{statement}` is `{verdict}`, because {outcome.claims_details}."
+    )
+
+    return verdict