diff --git a/src/aviary/core.py b/src/aviary/core.py index 587e76b5..6170a7bd 100644 --- a/src/aviary/core.py +++ b/src/aviary/core.py @@ -40,6 +40,7 @@ EvalAnswerMode, encode_image_to_base64, eval_answer, + extract_answer, is_coroutine_callable, partial_format, ) @@ -82,6 +83,7 @@ "encode_image_to_base64", "eval_answer", "eval_answer", + "extract_answer", "fenv", "is_coroutine_callable", "join", diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 35687bd2..b495f18b 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -3,10 +3,9 @@ import inspect import io import random -import re import string from ast import literal_eval -from collections.abc import Awaitable, Callable, Sequence +from collections.abc import Sequence from enum import StrEnum from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast @@ -21,8 +20,8 @@ import numpy as np -DEFAULT_EVAL_MODEL_NAME = "gpt-4o" -LLM_BOOL_EVAL_CONFIG = { +DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini" +LLM_BOOL_EVAL_CONFIG: dict[str, Any] = { "prompt": ( "Here is a question, the correct answer to the question, and a proposed answer" " to the question. Please tell me if the proposed answer is correct, given the" @@ -35,6 +34,18 @@ "temperature": 0, } +LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | { + "prompt": ( + "You are evaluating answers for a test which has fixed options. " + "Repeat back which option the proposed answer matches. " + "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " + "If the proposed answer is empty, invalid, or ambiguous, " + "return an empty string." + "\n\nOptions:\n{options}" + "\n\nProposed answer: {proposed_answer}" + ) +} + LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" @@ -175,21 +186,36 @@ async def eval_answer( raise RuntimeError(f"Invalid evaluation mode: {eval_mode}") +async def extract_answer( + proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None +) -> str | None: + """Extract the answer matching a proposal from a list of options using an LLM.""" + for option in options: + if proposed_answer.strip().casefold() == option.strip().casefold(): + return option + + default_config = LLM_EXTRACT_CONFIG + config = llm_eval_config or default_config + response_msg = await run_prompt( + prompt=config.get("prompt", default_config["prompt"]).format( + options="\n".join(options), + proposed_answer=proposed_answer, + ), + model=config.get("model", default_config["model"]), + temperature=config.get("temperature", default_config["temperature"]), + ) + answer = response_msg.strip().casefold() # noqa: FURB184 + for option in options: + if answer == option.strip().casefold(): + return option + return None + + _CAPITAL_A_INDEX = ord("A") class MultipleChoiceQuestion(BaseModel): QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}" - # TODO: combine with above eval_answer and its prompts - EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = ( - "Given the following question and a proposed answer to the question, return the" - " single-letter choice in the question that matches the proposed answer." - " If the proposed answer is blank or an empty string," - " or multiple options are matched, respond with '0'." 
- "\n\nQuestion: {qa_prompt}" - "\n\nProposed Answer: {qa_answer}" - "\n\nSingle Letter Answer:" - ) DEFAULT_UNSURE_OPTION: ClassVar[str] = ( "Insufficient information to answer this question" ) @@ -280,18 +306,14 @@ def split_options(options: str) -> list[str]: return split_options async def grade( - self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None - ) -> "tuple[MultipleChoiceEvaluation, str, str]": - if prompt_runner is None: - prompt_runner = run_prompt - eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format( - qa_prompt=self.question_prompt, qa_answer=answer - ) - raw_evaluation = await prompt_runner(eval_prompt) - evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer( - raw_evaluation, self + self, proposed_answer: str + ) -> "tuple[MultipleChoiceEvaluation, str | None]": + extracted_answer = await extract_answer( + proposed_answer=proposed_answer, options=self.options ) - return evaluation, raw_evaluation, parsed_answer + return MultipleChoiceEvaluation.from_answer( + extracted_answer, self + ), extracted_answer class MultipleChoiceEvaluation(StrEnum): @@ -323,32 +345,19 @@ def calculate_accuracy_precision( @classmethod def from_answer( - cls, answer: str, question: MultipleChoiceQuestion - ) -> "tuple[MultipleChoiceEvaluation, str]": + cls, extracted_answer: str | None, question: MultipleChoiceQuestion + ) -> "MultipleChoiceEvaluation": """Make an evaluation from the input answer and multiple choice question. Returns: - Two-tuple of answer enum and the raw answer extracted from the input answer. + Evaluation corresponding to the parsed answer. """ - # SEE: https://regex101.com/r/vcE9Hb/1 - letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL) - # Get the letter answer, or fail over to the first non-whitespace char - answer_char = ( - letter_search.group(1) - if letter_search is not None - else answer.split()[0][0].upper() - ) - answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX - if answer_letter_index < 0 or answer_letter_index > len(question.options): - # The result extracted was not in the options (e.g. '0') - return cls.INCORRECT, answer_char + if extracted_answer is None: + return MultipleChoiceEvaluation.INCORRECT # From here, if we don't match either the ideal or the unsure multiple choice # options then we declare the answer as incorrect. 
- if ( - question.unsure_answer_index is not None - and answer_letter_index == question.unsure_answer_index - ): - return cls.UNSURE, cast(str, question.unsure_answer) - if answer_letter_index == question.ideal_answer_index: - return cls.CORRECT, question.ideal_answer - return cls.INCORRECT, question.options[answer_letter_index] + if extracted_answer == question.ideal_answer: + return MultipleChoiceEvaluation.CORRECT + if question.unsure_answer and extracted_answer == question.unsure_answer: + return MultipleChoiceEvaluation.UNSURE + return MultipleChoiceEvaluation.INCORRECT diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml index be2df091..617568d2 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle - Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information + to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role": + "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "513" + - "442" content-type: - application/json host: @@ -36,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k - u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw - 47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK - RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie - H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu - tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6 - AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da + tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX + wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr + +ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2 + EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI + XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH + wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fde1cf88cf1b-SJC + - 8f425bb2ac70f953-SJC Connection: - keep-alive Content-Encoding: @@ -65,9 +64,15 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:29 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -79,25 +84,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "363" + - "144" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999874" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_aff8daa48aa43d3df077f97da6136e5a + - req_503cd8163bd0d3b634eb723d6874b1da status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml index 38077163..77357e4c 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml +++ 
b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 14004\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 14004", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "464" content-type: - application/json host: @@ -36,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLT8MwEITv+RWWzw1KS2lCb3BCSDykcuAhFBl7kxocr2Vveajqf0dO - 2yQVIHHJYb6dyewm64QxrhWfMy6XgmTjTHpWPV6fFzf1Jcwebu4WF4tbunKFwvzj/n3GR9GBL68g - ae86ktg4A6TRbrH0IAhi6jg/nk7zLC9OWtCgAhNttaN0iukkm0zTrEizXa5copYQ+Jw9JYwxtm6f - saJV8MnnLBvtlQZCEDXweTfEGPdoosJFCDqQsMRHPZRoCWzbOhvqHqpVELGWXRmz0zfdiwzWzuNL - 2PFOr7TVYVl6EAFtDA2Ejrd0kzD23C60OujIncfGUUn4BjYGjscn2zzen3BAd4yQhBmaZqNf4koF - JLQJg4twKeQSVG/tzydWSuMAJIOlf5b5LXu7uLb1f+J7ICU4AlU6D0rLw4X7MQ/xB/trrDtyW5iH - r0DQlJW2NXjn9fYbV64U+akqlJDjiieb5BsAAP//AwBRMcSQ7AIAAA== + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyqB8MoNoR7aAxdKVamqImNvElPHtmyjPhD/XjlQ + AoJKvfgwszOeWXsXEQKCQ06A1dSzxsh4Vn7M71fZMntcjAbfz6uXjTSbh9mCPk2Xc+gFhV5vkPlf + 1R3TjZHohVYHmlmkHoNrOh5kw+FkmE5aotEcZZBVxseZjhuhRNxP+lmcjON0clTXWjB0kJPXiBBC + du0ZciqOn5CTpPeLNOgcrRDy0xAhYLUMCFDnhPNUeeh1JNPKo2qjn8MWy62jIZraSnnE96d7pK6M + 1Wt35E94KZRwdWGROq2Cp/PaQMvuI0Le2j7bi4hgrG6ML7x+RxUMp+nBDrotduSxKnjtqbyhuTAr + OHoqpDtbBzDKauRXhoQA3XKhz4jorPJ1llveh9pCVf+x7wjG0HjkhbHIBbvZtzUPX+yvsdOK28Dg + vpzHpiiFqtAaKw4PXJpiVLI0wTTBNUT76AcAAP//AwBkI2np7gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdc63fbf9e53-SJC + - 8f425bb11b702519-SJC Connection: - keep-alive Content-Encoding: @@ -65,9 +64,15 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:25 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=6j4w6Jnsg0wGsZf61WcNCvHdr1Vcb6uVLFFhTQQgcv4-1734558518-1.0.1.1-D0vsT8nCM66xiA.Xa6ijXpgeGPM65Iux2KhQqUiD8wToq.VmwT03dnkmELw1qn0GvHJvh8g7H6WkqYzXVgs2Xg; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=LFVOxysXKxTPNQ2KK05aqbBnIRDPc45hskCPkFcOjXA-1734558518178-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -79,25 +84,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "212" + - 
"131" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999868" + - "149999890" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_afd8c66d84f3b42a8cd2b8a6bf855054 + - req_12c5e1cdb8b2ba32b075f04f20194421 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml index 057ef1d0..6865d713 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: \n\nSingle - Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role": + "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "517" + - "445" content-type: - application/json host: @@ -46,18 +45,16 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLTsMwELznKyyfG+S2gaa5FegFJKRKnEAocp1NaurYlr3hoar/jpyG - JBUgcfFhZmc8O/YhIoTKgmaEih1HUVsVr8qnh/XbzfvdI7LN7WzF9vf8Mik31xtYN3QSFGb7CgK/ - VRfC1FYBSqNPtHDAEYLrdDFPkgVbpGlL1KYAFWSVxTgx8YzNkpilMbvqhDsjBXiakeeIEEIO7Rki - 6gI+aEbY5BupwXteAc36IUKoMyoglHsvPXKNdDKQwmgE3aZmY9xB2XgeYulGqQ4/9hcpU1lntr7j - e7yUWvpd7oB7o4OpR2Npyx4jQl7ahZqzjNQ6U1vM0exBB8MpW5786FDhiO04NMjVCJ52LZzb5QUg - l8qPGqGCix0Ug3SojzeFNCMiGi39M8xv3qfFpa7+Yz8QQoBFKHLroJDifOFhzEH4YH+N9SW3gan/ - 9Ah1XkpdgbNOnt64tPmy5Fu+LNk8pdEx+gIAAP//AwDTwVpp7AIAAA== + H4sIAAAAAAAAAwAAAP//jFJdS8MwFH3vrwj3eZV2X46+6RAR0T2JikjJkts2miYhSVEZ+++Srms3 + NsGXPJxzz8k5N9lEhIDgkBFgFfWsNjK+Kr6WN6vb8XL+9LqSxj3Qu2R1ff/8yCfiBUZBodcfyPxe + dcF0bSR6odWOZhapx+CaXk6ms9lili5aotYcZZCVxsdTHddCiXicjKdxchmni05dacHQQUbeIkII + 2bRnyKk4fkNGktEeqdE5WiJk/RAhYLUMCFDnhPNUeRgNJNPKo2qjH8IWi8bREE01Unb4tr9H6tJY + vXYd3+OFUMJVuUXqtAqezmsDLbuNCHlv+zRHEcFYXRufe/2JKhgu5js7GLY4kF1V8NpTeUZzZJZz + 9FRId7AOYJRVyE8MCQHacKEPiOig8mmWc9672kKV/7EfCMbQeOS5scgFO9u3NQ9f7K+xfsVtYHA/ + zmOdF0KVaI0VuwcuTD4vWJpgmuAaom30CwAA//8DAL0A1qzuAgAA headers: - CF-Cache-Status: - - DYNAMIC CF-RAY: - - 8f39fddcea1d251d-SJC + - 8f425bb5de5996de-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +62,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:28 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -76,28 +73,30 @@ 
interactions: - X-Request-ID alt-svc: - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC openai-organization: - future-house-xr4tdh openai-processing-ms: - - "174" + - "233" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999872" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_4d40eb2c66dfd308a7b75c7cd80c405b + - req_0c845e0049332bd1fa73fdbe76005ea1 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml index a0acce15..4a0fa4ae 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: \n\nSingle Letter - Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information + to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "511" + - "440" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJRT4MwFIXf+RVNn4eBicPx5oszWTJjfNBoDOnaC9SVtmlLMrPsv5sW - HCzOxBceznfP4dwLhwghzBkuEKYNcbTVIr6r3jar1eb5IXtKb7brPL9/3e/a5fxFrOtHPPMOtf0E - 6n5cV1S1WoDjSvaYGiAOfGqaX2dZnuTLJIBWMRDeVmsXZyqeJ/MsTm7jZDEYG8UpWFyg9wghhA7h - 6StKBntcoBATlBasJTXg4jSEEDZKeAUTa7l1RDo8GyFV0oEMrZOpbqDqLPG1ZCfEoB9PLxKq1kZt - 7cBPesUlt01pgFglfah1SuNAjxFCH2Gh7qwj1ka12pVO7UD6wDRZ9Hl4POGEDswpR8TUlM8uxJUM - HOHCTi6CKaENsNE6no90jKsJiCZL/y5zKbtfnMv6P/EjoBS0A1ZqA4zT84XHMQP+B/tr7HTkUBjb - L+ugLSsuazDa8P4bV7ok+ZLdMkLTCkfH6BsAAP//AwBwbnWk7AIAAA== + H4sIAAAAAAAAAwAAAP//jFJda8IwFH3vrwj32Q6tLTrfhjB8FhyyMUpMbttomoQk3Qfifx+ptXXo + YC95OOeek3NucowIAcFhQYBV1LPayPip+Fw+rz5e6221zta4fKENiu1qszkk2z2MgkLv9sj8RfXA + dG0keqHVmWYWqcfgOplN0yybZ5PHlqg1RxlkpfFxquNaKBEn4ySNx7N4Mu/UlRYMHSzIW0QIIcf2 + DDkVxy9YkPHogtToHC0RFv0QIWC1DAhQ54TzVHkYDSTTyqNqo1/DFovG0RBNNVJ2+Km/R+rSWL1z + Hd/jhVDCVblF6rQKns5rAy17igh5b/s0vyKCsbo2Pvf6gCoYztOzHQxbHMiuKnjtqbyj+WWWc/RU + SHe1DmCUVchvDAkB2nChr4joqvJtlnve59pClf+xHwjG0HjkubHIBbvbtzUPX+yvsX7FbWBw385j + nRdClWiNFecHLkzOxwnPppNdOoPoFP0AAAD//wMAMCnsc+4CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fde81f05ceb1-SJC + - 8f425bb72f9b67dc-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:30 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -79,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "332" + - "532" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999875" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_817ca7ae018d7baa48236c7ad4f4f151 + - req_ed9d0e7998f792094d5aefe723693f28 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml index d70cc972..f6e5e085 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml @@ -1,14 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. 
If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What method - was used to demonstrate that the enzyme PafA is stable after incubation with - 4M urea for 14 days?\n\nOptions:\nA) cryo EM\nB) Insufficient information to - answer this question\nC) NMR\nD) x-ray crystallography\nE) circular dichroism\n\nProposed - Answer: \n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\ncryo EM\nInsufficient information + to answer this question\nNMR\nx-ray crystallography\ncircular dichroism\n\nProposed + answer: ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -17,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "624" + - "472" content-type: - application/json host: @@ -47,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLNboMwEITvPIXlM1SEkJJyS9VIlfpz6SmpKuSYhTg1tmVvpEZR3r0y - IUDVVOqFw3w7w+zCMSCEipLmhPItQ94YGS2q9eujmL9NXtaJWKW7wxM+r+TD8rC4ny5p6B16swOO - F9cN142RgEKrM+YWGIJPnWTTNM3i7G7SgkaXIL2tNhilOkriJI3ieRTfdsatFhwczcl7QAghx/bp - K6oSvmhO4vCiNOAcq4Hm/RAh1GrpFcqcEw6ZQhoOkGuFoNrW8Vi3UO0d87XUXspOP/Uvkro2Vm9c - x3u9Ekq4bWGBOa18qENtaEtPASEf7UL7Hx2psboxWKD+BOUDJ9PknEeHE45ox1Ajk2PTNLwSV5SA - TEg3ugjljG+hHKzD+di+FHoEgtHSv8tcyz4vLlT9n/gBcA4GoSyMhVLwnwsPYxb8D/bXWH/ktjB1 - B4fQFJVQNVhjxfkbV6aosvkMNrMqzWhwCr4BAAD//wMANO06tewCAAA= + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kypAKIRbH+pDVU+9VK2qyDibxNSxLXtRSxH/Xjm8 + gqBSLz7M7Ixn1l5FjIEsYMpA1JxEY1V8VX7d3N2/PP7czp9f39QwmTzJ5UNqqb52M+gFhZnNUdBO + dSFMYxWSNHpDC4ecMLj2x8N0NJqM+llLNKZAFWSVpTg1cSO1jAfJII2TcdyfbNW1kQI9TNl7xBhj + q/YMOXWB3zBlSW+HNOg9rxCm+yHGwBkVEODeS09cE/QOpDCaULfRu7DDcuF5iKYXSm3x9f4eZSrr + zMxv+T1eSi19nTvk3ujg6clYaNl1xNhH22dxFBGsM42lnMwn6mCY9Td2cNjigdxWBTLE1RnNkVle + IHGpfGcdILiosTgxZAz4opCmQ0SdyqdZznlvaktd/cf+QAiBlrDIrcNCirN9W/Pwxf4a26+4DQx+ + 6QmbvJS6Qmed3DxwafNhxtNEZJc8gWgd/QIAAP//AwCjNKe67gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdedda0ceb36-SJC + - 8f425bbaab9a236e-SJC Connection: - keep-alive Content-Encoding: @@ -66,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:31 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -80,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "259" + - "231" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999845" + - "149999888" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_223a9415a5a19029f86768ffbabf3d6f + - req_427dff29f2a632ec0882c27c797f5d5a status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml index 7f73abaa..f126cb68 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml +++ 
b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "464" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6ggoXlwyyGKcql6Sx+qkLEXcGtsyzZSoyj/vTKQ - QNRU6oXDfDvD7MIpQAhzhjOEaU0cbbSINuXb02Z73L287ndH1ias2C8k3a0OB1Y949A7VPEJ1F1c - D1Q1WoDjSvaYGiAOfGqynKfpMl6u0g40ioHwtkq7KFXRLJ6lUbyK4sVgrBWnYHGG3gOEEDp1T19R - MvjGGYrDi9KAtaQCnF2HEMJGCa9gYi23jkiHwxFSJR3IrvV2qhsoW0t8LdkKMejn64uEqrRRhR34 - VS+55LbODRCrpA+1Tmnc0XOA0Ee3UHvTEWujGu1yp75A+sAkeezz8HjCCR2YU46IqWkR3onLGTjC - hZ1cBFNCa2CjdTwfaRlXExBMlv5d5l52vziX1X/iR0ApaAcs1wYYp7cLj2MG/A/219j1yF1hbI/W - QZOXXFZgtOH9Ny51vi5JQdZlPF/h4Bz8AAAA//8DAKuPA4PsAgAA + H4sIAAAAAAAAAwAAAP//jJI/b8MgEMV3fwrEbFf+FznxllSpOlTKVHWoKovA2abFgACrSaN89won + jR01lbow3O/e493BIUAIc4ZLhGlLHO20iJb15/3DelPvN7De7lY6fukfnxl7ylaL5RcOvUJt34G6 + H9UdVZ0W4LiSJ0wNEAfeNSmyfDabz5LFADrFQHhZo12Uq6jjkkdpnOZRXETJ/KxuFadgcYleA4QQ + OgynzykZ7HCJ4vCn0oG1pAFcXpoQwkYJX8HEWm4dkQ6HI6RKOpBD9EWexMWUGah7S3w+2Qtxrh8v + lwnVaKO29swv9ZpLbtvKALFKemPrlMYDPQYIvQ1D9Vc5sTaq065y6gOkN1wkJzs8rnKE6Zk55YiY + aLLwhlnFwBEu7GQnmBLaAhuV4wJJz7iagGAy8u8st7xPY3PZ/Md+BJSCdsAqbYBxej3v2GbA/7O/ + 2i4rHgJju7cOuqrmsgGjDT+9cq0rFqdsliXbvMDBMfgGAAD//wMAitN9t/MCAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdc06b78cf26-SJC + - 8f425bb60cfe17e4-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:24 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -79,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "291" + - "538" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999868" + - "149999891" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_89e3d88c7f12861d7e774e452300b36d + - req_9bd9d799783ab13ef59ce8e5ca7fd25f status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml 
b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml index 45376f48..842cfbf0 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94106\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94106", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "464" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4ySX2uDMBTF3/0UIc912M7WzreVjv1hDPo06BiSJlfNFpMsibBS+t1H1KplHezF - h/O753ju1UOAEOYMpwjTkjhaaRHe5tuX1erpfsv3z8s4ed1sHvL1191+UT9WDk+8Q+0+gLqT64qq - SgtwXMkWUwPEgU+dJtdxnETJct6ASjEQ3lZoF8YqnEWzOIyWYbTojKXiFCxO0VuAEEKH5ukrSgbf - OEXR5KRUYC0pAKf9EELYKOEVTKzl1hHZ1u0gVdKBbFqvx7qBvLbE15K1EJ1+7F8kVKGN2tmO93rO - JbdlZoBYJX2odUrjhh4DhN6bheqzjlgbVWmXOfUJ0gdOp/M2Dw8nHNGOOeWIGJsWkwtxGQNHuLCj - i2BKaAlssA7nIzXjagSC0dK/y1zKbhfnsvhP/AAoBe2AZdoA4/R84WHMgP/B/hrrj9wUxnZvHVRZ - zmUBRhvefuNcZyS5YUtG6DTHwTH4AQAA//8DAK9WW8vsAgAA + H4sIAAAAAAAAA4ySPW/CMBCG9/wKyzOpEggfYauqbgwsnaoqMs4lmDo+y74IWsR/rxw+ElQqdfFw + z72v3zv7GDHGVcmXjMutINlYHT9X+5dXO1mtdEbYzla7w/z7sH5bi30+rfkoKHCzA0lX1ZPExmog + heaMpQNBEFzT+SSbThfTdNGBBkvQQVZbijOMG2VUPE7GWZzM43RxUW9RSfB8yd4jxhg7dmfIaUo4 + 8CVLRtdKA96LGvjy1sQYd6hDhQvvlSdhiI96KNEQmC56nqXJbMgcVK0XIZ9ptb7UT7fLNNbW4cZf + +K1eKaP8tnAgPJpg7Akt7+gpYuyjG6q9y8mtw8ZSQfgJJhjm6dmO96vs4fjCCEnogWYyemBWlEBC + aT/YCZdCbqHslf0CRVsqHIBoMPLvLI+8z2MrU//HvgdSgiUoC+ugVPJ+3r7NQfhnf7XdVtwF5v7L + EzRFpUwNzjp1fuXKFpNcZInMZyLh0Sn6AQAA//8DAL5Pl0/zAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdcb6d3b645e-SJC + - 8f425bb64ed0fa36-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:26 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -79,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "282" + - "247" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999868" + - "149999891" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_0f4462cd5dd31fe3e1a9d6847e563042 + - 
req_eb9ad02601ae4b1b2b579657ed9a7bef status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml index abe7094e..26df9d56 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml @@ -1,13 +1,13 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94106 or 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94106 or 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature": + 0}' headers: accept: - application/json @@ -16,7 +16,7 @@ interactions: connection: - keep-alive content-length: - - "545" + - "473" content-type: - application/json host: @@ -36,7 +36,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -46,18 +46,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFJBS8MwGL33V4ScV+m2sm69CaLTgyjIDhMpWfK1zZYmIUnFMfbfJW3X - dqjgJYf3vvfyvpecAoQwZzhFmJbE0UqL8DbfPt8V+4U8bO6PL0/rTUnWD8XrVr59PkZ44hVqtwfq - LqobqiotwHElW5oaIA686zSZx3ESJcukISrFQHhZoV0Yq3AWzeIwWobRohOWilOwOEXvAUIInZrT - R5QMvnCKoskFqcBaUgBO+yGEsFHCI5hYy60j0uHJQFIlHcgmdTTGDeS1JT6WrIXo8HN/kVCFNmpn - O77Hcy65LTMDxCrpTa1TGjfsOUDoo1movsqItVGVdplTB5DecDpdtX54qHDEdpxTjogRPOtauLbL - GDjChR01gimhJbBBOtRHasbViAhGS/8M85t3uziXxX/sB4JS0A5Ypg0wTq8XHsYM+A/211hfchMY - 26N1UGU5lwUYbXj7xrnOVjnZkVUezZc4OAffAAAA//8DAAUNxI3sAgAA + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyoJz3Jrq35ARakqVVVk7E0wdWzLXkQrxL9XDhBA + UKkXH2Z2xjNrbxPGQEmYMhBLTqJxOn2oNk/PzWJFr+KlmM03j/NZ8TbavMt+UVfQiwq7WKGgo+pO + 2MZpJGXNnhYeOWF0zcf9wXA4GeaTlmisRB1ltaN0YNNGGZUWWTFIs3GaTw7qpVUCA0zZR8IYY9v2 + jDmNxG+Ysqx3RBoMgdcI026IMfBWRwR4CCoQNwS9EymsITRt9HPYY7UOPEYza60P+K67R9vaebsI + B77DK2VUWJYeebAmegayDlp2lzD22fZZX0QE523jqCT7hSYa3g/3dnDa4ok8VAWyxPUNzYVZKZG4 + 0uFsHSC4WKK8MmQM+Foqe0YkZ5Wvs9zy3tdWpv6P/YkQAh2hLJ1HqcTNvq15/GJ/jXUrbgND+AmE + TVkpU6N3Xu0fuHLlqBJ5hnmGC0h2yS8AAAD//wMALlTCsO4CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdd6bc71fa2e-SJC + - 8f425bb169ea22f6-SJC Connection: - keep-alive Content-Encoding: @@ -65,9 +65,15 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:27 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=d8n1B6AzFA1xougxyBgoPLD0ITgb.iimKMM9kNYr6NA-1734558518-1.0.1.1-c8MRCOD4wNoPcANGb9a6gOWsl6NhHqx911Ktp.RARxFa..7XVR9hKaZVQ2nRa8g.bTL2e2pT7EpsuMaFLlx6Sw; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; 
HttpOnly; + Secure; SameSite=None + - _cfuvid=DPEKvT7hx6XvGnKxQqNrPq5Y4dSqkyQo4hPKRlWd79E-1734558518261-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -79,25 +85,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "249" + - "168" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999865" + - "149999888" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_72f1a2af642dad2884e52d652e775182 + - req_becb26d30d1adf2d410f311a4664a6b2 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml index 607cd8a2..5b56af9b 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: Insufficient - information\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: Insufficient + information", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "541" + - "469" content-type: - application/json host: @@ -46,18 +45,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6gIoDy4pS+lPeRWVWpVIWMWcGNsyzZSqyj/vTIQ - IGoq9cJhvp1hduHoIYRZgVOEaU0sbRQPtuXb/u4pP7w0z69kt3+I6221q/fN/SOJNPadQ+afQO3Z - dUNlozhYJkWPqQZiwaUuVnGSrMLVetmBRhbAna1SNkhkEIVREoTrIFwOxloyCgan6N1DCKFj93QV - RQFfOEWhf1YaMIZUgNNxCCGsJXcKJsYwY4mw2J8glcKC6FrfznUNZWuIqyVazgf9NL6Iy0ppmZuB - j3rJBDN1poEYKVyosVLhjp48hD66hdqLjlhp2SibWXkA4QIXi6jPw9MJZ3RgVlrC56bYvxKXFWAJ - 42Z2EUwJraGYrNP5SFswOQPebOnfZa5l94szUf0nfgKUgrJQZEpDwejlwtOYBveD/TU2HrkrjM23 - sdBkJRMVaKVZ/41LlW1KkpNNGcZr7J28HwAAAP//AwBPQ8gX7AIAAA== + H4sIAAAAAAAAAwAAAP//jFJNj9MwEL3nV1g+NygpLc32tnwcOC2CA0gIRa4zTgZsj9eeaIFV/zty + 2iZdsUhcfHhv3vN7o3kshJDYyb2QelCsXbDlrXl48y58+njv4i3s3n7+cPf6d1g3d1XT0Be5ygo6 + fAfNF9ULTS5YYCR/onUExZBd693LzXbbbOtmIhx1YLOsD1xuqHTosVxX601Z7cq6OasHQg1J7sXX + QgghHqc35/Qd/JR7Ua0uiIOUVA9yPw8JISPZjEiVEiZWnuVqITV5Bj9Ff+/TaAxqBM8CvaHoVO4g + mITy6QGi4AGTuB8hzd0uf4AZk8pV/GjtGT/OuSz1IdIhnfkZN+gxDW0ElcjnDIkpyIk9FkJ8m/qP + TyrJEMkFbpl+gM+GTXOyk8vWF3J35phY2QW+2a6eMWs7YIU2Xa1PaqUH6Bblsms1dkhXRHFV+e8s + z3mfaqPv/8d+IbSGwNC1IUKH+mnfZSxCPsl/jc0rngLL9CsxuNag7yGGiKeDMKF9ZXRdQV3BQRbH + 4g8AAAD//wMAIVEMVh4DAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdd11ed0175e-SJC + - 8f425bb5cdb77ac1-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +65,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:27 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -79,25 +79,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "196" + - "262" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999867" + - "149999890" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_9dd0f40823dceb910336a862f0513c68 + - req_ca5799089a4ca130483ac0a6fa172710 status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_answer[llm basic].yaml b/tests/cassettes/test_eval_answer[llm basic].yaml index 63f1bb18..18f9bfd9 100644 --- a/tests/cassettes/test_eval_answer[llm basic].yaml +++ b/tests/cassettes/test_eval_answer[llm basic].yaml @@ -7,7 +7,7 @@ interactions: other output is permitted.\n\nQuestion: Which of the following is most likely true:\n\nA) Piggie, B) Pigeon, C) Gerald\n\n\nCorrect answer: C\n\nProposed answer: Based on all factors considered, the most compelling answer is Gerald, - C", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + C", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +16,7 @@ interactions: connection: - keep-alive content-length: - - "516" + - "521" content-type: - application/json host: @@ -46,18 +46,16 @@ interactions: response: body: string: !!binary | - 
H4sIAAAAAAAAAwAAAP//jJI/b8IwEMX3fArLM6kCpPzbGDpQVahSh4pWVWTsS+Li+CzbkaCI7145 - AZKqVOri4X73nt+dfYwIoVLQBaG8ZJ5XRsXL/G09e3zePyX1YbxZrhzuynKTi9WXW7/SQVDg9hO4 - v6juOFZGgZeoW8wtMA/BdTgdp+k0mc5GDahQgAqywvg4xXiUjNI4mcXJ5CwsUXJwdEHeI0IIOTZn - iKgF7OmCJINLpQLnWAF0cW0ihFpUoUKZc9J5pj0ddJCj9qCb1JuHlz6xkNeOhWC6VupcP12vUlgY - i1t35td6LrV0ZWaBOdTB1nk0tKGniJCPZqT6R0pqLFbGZx53oIPhMLlv/Wi3xB49M4+eqb5oMrhh - lwnwTCrX2wnljJcgOmm3QFYLiT0Q9Yb+HeaWdzu41MV/7DvAORgPIjMWhOQ/B+7aLIQv9lfbdclN - YOoOzkOV5VIXYI2V7SvnJpvnbMvmeTKe0egUfQMAAP//AwAWS34s7gIAAA== + H4sIAAAAAAAAA4ySy2rDMBBF9/4KoXVcnMTOa1dKCl00lAZKHxSjSGNbjSwJaUJTSv69yHnYoSl0 + o8WcuVd3RvqOCKFS0BmhvGLIa6vi6+LzZr54vh2tB/Xr0zgTi8Lh/ZY93vHlA+0FhVl9AMej6oqb + 2ipAafQecwcMIbj2x8M0yyZZf9KA2ghQQVZajFMT11LLeJAM0jgZx/3JQV0ZycHTGXmLCCHkuzlD + Ti1gS2ck6R0rNXjPSqCzUxMh1BkVKpR5Lz0yjbTXQm40gm6iv8yXXeKg2HgW0umNUof67nSVMqV1 + ZuUP/FQvpJa+yh0wb3Sw9WgsbeguIuS9GWlzlpJaZ2qLOZo16GDYT7K9H2032aEHhgaZ6opGvQt2 + uQBkUvnOTihnvALRStsFso2QpgOiztC/w1zy3g8udfkf+xZwDhZB5NaBkPx84LbNQfhnf7WdltwE + pv7LI9R5IXUJzjq5f+XC5sMpSxM+HbGERrvoBwAA//8DAJN7IxXzAgAA headers: - CF-Cache-Status: - - DYNAMIC CF-RAY: - - 8f39fdb5cae1158a-SJC + - 8f425bb118049453-SJC Connection: - keep-alive Content-Encoding: @@ -65,14 +63,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:22 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=lVkT7i5qloNOJW3VW5kf8Ohm6U080WiPUv6XirXCoFk-1734470782-1.0.1.1-nAgxt2GizSWkF.auEc_j1tv3Erjbd74Lsh9WJmMaZa_E8fpVuEZ8SsBIqLBHICQDV0sfwSjHgP9mTBHQujl_XA; - path=/; expires=Tue, 17-Dec-24 21:56:22 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=shlFi0WrRQqtHm9BFHA8BA_DE3OgD.WLNX_BG0MJ.Uc-1734558518-1.0.1.1-dTPiGPfeRXm4eFyNx5Qhh98ITpHISNJJ15gnJl7VfBbOzj3CoF.H.Mssss_WvoWjPSiaq4ZWwBKCF16.mbMFig; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=YCWb3aZdtzEmsWTuiPgC.gchnL7jvJLEWh9yvJqAiAw-1734470782603-0.0.1.1-604800000; + - _cfuvid=LbfayFWmgFkPH4gOfhOfLicD7koAa3IqwrVpt0Q2uQ0-1734558518270-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -82,28 +80,30 @@ interactions: - X-Request-ID alt-svc: - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC openai-organization: - future-house-xr4tdh openai-processing-ms: - - "124" + - "226" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29998" x-ratelimit-remaining-tokens: - - "29999876" + - "149999877" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_84a9ec4746765b74e4d84610ebc880ad + - req_c627f8d13c1969c6fd3a26f94a43a44f status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_llm_config.yaml b/tests/cassettes/test_eval_llm_config.yaml index 383479dc..7268d855 100644 --- a/tests/cassettes/test_eval_llm_config.yaml +++ b/tests/cassettes/test_eval_llm_config.yaml @@ -5,7 +5,7 @@ interactions: question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. 
No other output is permitted.\n\nQuestion: What is 25 * 10?\n\nCorrect answer: - 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o", "temperature": + 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0.5}' headers: accept: @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "387" + - "392" content-type: - application/json host: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1ycH+okt0Ja6KUthBzaUowirW2lslZIG0gJefci - x4kT2kIvPsy3M55de58wxrXic8ZlLUg2zqR35dvTLF+8PJgRPhLUrtwszWKVPa92M+CD6MD1BiSd - XDcSG2eANNojlh4EQUwd5uPJJM/y6bgFDSow0VY5SieYjrLRJM2maXbbGWvUEgKfs/eEMcb27TNW - tAp2fM6ywUlpIARRAZ+fhxjjHk1UuAhBBxKW+KCHEi2BbVu/3i8viYdyG0QsZrfGdPrh/CqDlfO4 - Dh0/66W2OtSFBxHQxthA6HhLDwljH+1K26uW3HlsHBWEn2BjYD47xvH+hj0cdoyQhOnlaXeF67BC - AQltwsVFuBSyBtU7+/OJrdJ4AZKLlX92+S37uLa21X/ieyAlOAJVOA9Ky+t9+zEP8Qf7a+x84rYw - D1+BoClKbSvwzuvjNy5dIfKZmiohhyVPDsk3AAAA//8DADQLsKzsAgAA + H4sIAAAAAAAAAwAAAP//jJJfa4MwFMXf/RQhz3Vo/8zWt1H60LExaAdjjCFpctVsMQlJpCul333E + WrVsg734cH73HM+9egwQwpzhFGFaEkcrLcK7fL9cMbp+vF/L/fLpsH3eVGabLDb5w+QFj7xD7T6A + uovrhqpKC3BcyTOmBogDnxonk+lsNp/F8wZUioHwtkK7cKrCiksejqPxNIySMJ637lJxChan6C1A + CKFj8/Q9JYMvnKJodFEqsJYUgNNuCCFslPAKJtZy64h0eNRDqqQD2VR/XW2HxEBeW+LbyVqIVj91 + rxKq0EbtbMs7PeeS2zIzQKySPtY6pXFDTwFC781K9VVLrI2qtMuc+gTpA5PFOQ73h+xh3DKnHBG9 + PG+vcB2WMXCECzu4CKaElsB6Z38+UjOuBiAYrPyzy2/Z57W5LP4T3wNKQTtgmTbAOL3etx8z4P+y + v8a6EzeFsT1YB1WWc1mA0Yafv3Gus9ucxhHEEexwcAq+AQAA//8DAPSiOYXxAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdbac9db643b-SJC + - 8f425bb65e7c6453-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:23 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -78,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "231" + - "229" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999909" + - "149999909" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_6538a77713d1ec9b61a8e15f3cf37377 + - req_1f8e0a7bd96417a061e010db00f50b6f status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer[complex].yaml b/tests/cassettes/test_extract_answer[complex].yaml new file mode 100644 index 00000000..8b076955 --- /dev/null +++ b/tests/cassettes/test_extract_answer[complex].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial + unrest\nPolitical corruption\n\nProposed answer: Based on the context given, + Serif et al. (2026) claim that the overwhelming cause of regime collapse arises + from economic factors. Yet, most other scholars (Gerald and Robinson for example) + believe the collapse was due to social unrest because of the prolonged epidemic + of 2025. I tend to agree with the majority - although I can see both sides. 
+ Thus my response is that the social unrest was the significant factor in the + collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature": + 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "866" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1zsNE5S30qhhzZQaKE5lGIUaW2rkbVCkklDyLsX + OT92aAq96LDfzmh2pV1ECJWC5oTymnneGBXfl5uHx4VKt7PNy3KdLerpUj8/vavXxtkNHQUFrr6A + +5PqhmNjFHiJ+oC5BeYhuKaz20mWzbP0rgMNClBBVhkfTzBupJbxOBlP4mQWp/OjukbJwdGcfESE + ELLrzpBTC/imOUlGp0oDzrEKaH5uIoRaVKFCmXPSeaY9HfWQo/agu+hvyCVTpNUW3EWPhbJ1LOTU + rVLH+v58qcLKWFy5Iz/XS6mlqwsLzKEOFziPhnZ0HxHy2Q3XXuSlxmJjfOFxDToYptPs4Ef7nfZ0 + fGQePVND0Wx0xa4Q4JlUbrAdyhmvQfTSfpWsFRIHIBoM/TvMNe/D4FJX/7HvAedgPIjCWBCSXw7c + t1kIP+6vtvOSu8DUbZ2HpiilrsAaKw/vXZpiWvI0gTSBFY320Q8AAAD//wMA8VLBff0CAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f425bb69fe5250c-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:48:39 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "491" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999790" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_0446ed4c188b77427f33f74f91e0d112 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[empty-proposal].yaml b/tests/cassettes/test_extract_answer[empty-proposal].yaml new file mode 100644 index 00000000..576fc15e --- /dev/null +++ b/tests/cassettes/test_extract_answer[empty-proposal].yaml @@ -0,0 +1,108 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "374" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFJNS8QwFLz3V4R33kq72/3qTUQ8CSLiRaSkyWsbTZOQpKgs+98l3d22 + y67gJYeZN5OZl+wiQkBwyAmwhnrWGhnfVl939zxzDy+CCfO6KB+TLadl6rbPTyuYBYUuP5D5k+qG + 6dZI9EKrA80sUo/BNV0vsuVys0w3PdFqjjLIauPjTMetUCKeJ/MsTtZxujmqGy0YOsjJW0QIIbv+ + DDkVx2/ISTI7IS06R2uEfBgiBKyWAQHqnHCeKg+zkWRaeVR99ClsseocDdFUJ+UR3w/3SF0bq0t3 + 5Ae8Ekq4prBInVbB03ltoGf3ESHvfZ/uLCIYq1vjC68/UQXDdXqwg3GLI3msCl57Kq9ozswKjp4K + 6SbrAEZZg/zCkBCgHRd6QkSTypdZrnkfagtV/8d+JBhD45EXxiIX7Grf3jx8sb/GhhX3gcH9OI9t + UQlVozVWHB64MsWqYmmCaYIlRPvoFwAA//8DACbc6TvuAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f425bb11ee3eb30-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:48:38 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=3EbR6c_9nmNeI58TWDLCiyFbbzWnxiCAQfgz1Ou5oXQ-1734558518-1.0.1.1-_OVXY1MiEfz9j5Sl02ocx_beYJRhzMj_5kdzhk9Gq_NIORYBNM4OqmSmTCUwNu.EObKQiWZdQdrwqZ84sr8.cQ; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=U.00GXQIFA3gE8IldpDjXxcp1niJXAkehSRhHT85pWs-1734558518279-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "249" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29998" + x-ratelimit-remaining-tokens: + - "149999913" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_63d20bd456f7f2145bc66a3ae269bc1e + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[gave-two].yaml b/tests/cassettes/test_extract_answer[gave-two].yaml new file mode 100644 index 00000000..a529a885 --- /dev/null +++ b/tests/cassettes/test_extract_answer[gave-two].yaml @@ -0,0 +1,108 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "380" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFJNTwIxFLzvr2jemTW7fAhyI8TozUSjiRizKd23S7Xb17QPlRD+u+mC + gAETLz3MvJnOvHadCAG6hLEAtZCsGmfSSfU5vX6YXd2E2YRGt4Mp493qqZp/PD7jPXSiguZvqPhH + daGocQZZk93SyqNkjK75sNcfDEaDfNQSDZVooqx2nPYpbbTVaTfr9tNsmOajnXpBWmGAsXhJhBBi + 3Z4xpy3xC8Yi6/wgDYYga4TxfkgI8GQiAjIEHVhahs6BVGQZbRv9GPZYLYOM0ezSmB2+2d9jqHae + 5mHH7/FKWx0WhUcZyEbPwOSgZTeJEK9tn+WviOA8NY4Lpne00XDY29rBYYsHclcVmFiaM5pfZkWJ + LLUJR+sAJdUCyxNDIUAuS01HRHJU+TTLOe9tbW3r/9gfCKXQMZaF81hqdbZvax6/2F9j+xW3gSGs + AmNTVNrW6J3X2weuXHFZqTzDPMM5JJvkGwAA//8DAOjXCFXuAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f425bb11e1069a2-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:48:38 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=eAk9PjLOP_uC98HrFuiPUxdGbMOD0FndASetRInyC8E-1734558518-1.0.1.1-czBHIlZrAXhRtJiNtQMJ4FNObmpYfP0sPzRSb84VB2iiFfmBNMFsZOSzB8kN5BWGvHDUXsKgWJTphYPTQzM3FA; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=dYXAYAvcpEWoKaqCouzZ9rcGFRQEzhYA4XzFKsQi83I-1734558518200-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "171" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999912" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_efdaa27fda18e26d87bcadcc80237c76 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[not in options].yaml b/tests/cassettes/test_extract_answer[not in options].yaml new file mode 100644 index 00000000..70884d60 --- /dev/null +++ b/tests/cassettes/test_extract_answer[not in options].yaml @@ -0,0 +1,108 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer: + F", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "372" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9NyjpM/QGiAsSEuICEkKR62wSg2Nb9rY8qv47chLa + VC0SFx9mdsYza28jxkAWsGQgak6isSq+Kj9ubt0dyflYyIfy8b7mT5ss3Txff6sJjILCrN5Q0K/q + QpjGKiRpdEcLh5wwuKaLyXQ2y2Zp1hKNKVAFWWUpnpq4kVrG42Q8jZNFnGa9ujZSoIcle4kYY2zb + niGnLvATliwZ/SINes8rhOV+iDFwRgUEuPfSE9cEowMpjCbUbfQh7LBcex6i6bVSPb7b36NMZZ1Z + +Z7f46XU0te5Q+6NDp6ejIWW3UWMvbZ91kcRwTrTWMrJvKMOhvPLzg4OWzyQfVUgQ1yd0RyZ5QUS + l8oP1gGCixqLE0PGgK8LaQZENKh8muWcd1db6uo/9gdCCLSERW4dFlKc7duahy/219h+xW1g8F+e + sMlLqSt01snugUubz0uRJpgmuIJoF/0AAAD//wMAUYws+e4CAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f425bb11ca22513-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:48:38 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=ABTDwd4t79cPLIko1hPlFoZXxUQ6rzPq8jHwq1Xy7XE-1734558518-1.0.1.1-Qqt3v2jz7xPx17Fx0ehWguxbmaMuZk4B3NM4Z1HW2aMmaaTMq2RvfX.y5A9X5qv4xoO0qWDJdyM.E9ahp.RW5A; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=17oj8YL1hlYLaR7o.N8HEjWKDALCyYtBfmHe30jFAG0-1734558518262-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "232" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29996" + x-ratelimit-remaining-tokens: + - "149992191" + x-ratelimit-reset-requests: + - 7ms + x-ratelimit-reset-tokens: + - 3ms + x-request-id: + - req_e11e29110308fec0a5310bf18d49c27d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index 3469f01c..0962e24c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import pytest -from aviary.core import eval_answer +from aviary.core import eval_answer, extract_answer from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion from tests.conftest import VCR_DEFAULT_MATCH_ON @@ -39,6 +39,35 @@ async def test_eval_answer( assert await eval_answer(proposed, correct, question, eval_mode) == expected +@pytest.mark.vcr +@pytest.mark.parametrize( + ("proposed_answer", "options", "expected"), + [ + pytest.param("A", ["A", "B", "C"], "A", id="exact-uppercase"), + pytest.param("a", ["A", "B", "C"], "A", id="exact-lowercase"), + pytest.param("F", ["B", "C"], None, id="not in 
options"), + pytest.param("A or B", ["A", "B", "C"], None, id="gave-two"), + pytest.param( + "Based on the context given, Serif et al. (2026) claim that " + "the overwhelming cause of regime collapse arises from economic factors. " + "Yet, most other scholars (Gerald and Robinson for example) believe the collapse " + "was due to social unrest because of the prolonged epidemic of 2025. I tend to agree " + "with the majority - although I can see both sides. Thus my response " + "is that the social unrest was the significant factor in the collapse of the regime.", + ["Economic factors", "Social unrest", "Political corruption"], + "Social unrest", + id="complex", + ), + pytest.param("", ["A", "B", "C"], None, id="empty-proposal"), + ], +) +@pytest.mark.asyncio +async def test_extract_answer( + proposed_answer: str, options: Sequence[str], expected: str | None +) -> None: + assert await extract_answer(proposed_answer, options) == expected + + @pytest.mark.vcr @pytest.mark.asyncio async def test_eval_llm_config(): @@ -108,7 +137,7 @@ def _assert_prompt_is_valid( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "the answer is 14004", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="didnt-match-and-no-llm-innate-knowledge", ), pytest.param( @@ -129,35 +158,35 @@ def _assert_prompt_is_valid( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "the answer is 94106 or 94107", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="matched-several-options", ), pytest.param( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer1", ), pytest.param( *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, "14", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="didnt-match-and-llm-has-innate-knowledge", ), pytest.param( *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer2", ), pytest.param( *LITQA2_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer3", ), ], @@ -169,7 +198,7 @@ async def test_grade( distractors: str | list[str], actual_answer: str, expected_eval: MultipleChoiceEvaluation, - expected_extracted_answer: str, + expected_extracted_answer: str | None, ) -> None: """Tests that we can create a multiple choice question and evaluate answers.""" mc_question = MultipleChoiceQuestion( @@ -179,7 +208,7 @@ async def test_grade( shuffle_seed=42, # Seed for VCR cassette ) self._assert_prompt_is_valid(mc_question, question, ideal_answer, distractors) - evaluation, _, graded_answer = await mc_question.grade(actual_answer) + evaluation, graded_answer = await mc_question.grade(actual_answer) assert evaluation == expected_eval if evaluation == MultipleChoiceEvaluation.CORRECT: assert graded_answer == ideal_answer