diff --git a/Dockerfile b/Dockerfile
index adbbc2f..305e171 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,9 +5,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-COPY ./tests /app/tests
-COPY ./lib /app/lib
-COPY ./src /app/src
+COPY . .
 
 EXPOSE 80
 
-CMD ["waitress-serve", "--host=0.0.0.0", "--port=80", "--call", "src:create_app"]
+CMD ["waitress-serve", "--host=0.0.0.0", "--port=80", "--call", "aiproxy.app:create_app"]
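For reference, the `--call` flag tells `waitress-serve` to import `aiproxy.app`, call its `create_app()` factory, and serve the returned WSGI app. A minimal sketch of the equivalent invocation from Python (host and port mirror the CMD above; this is illustrative only, not part of the change):

```python
# Rough equivalent of: waitress-serve --host=0.0.0.0 --port=80 --call aiproxy.app:create_app
# Sketch only; assumes the aiproxy package is importable (e.g. after `pip install -e .`).
from waitress import serve

from aiproxy.app import create_app

app = create_app()                    # the "--call" part: invoke the factory to get the WSGI app
serve(app, host="0.0.0.0", port=80)   # then hand it to waitress
```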
diff --git a/README.md b/README.md
index 2fdb022..eae4cc0 100644
--- a/README.md
+++ b/README.md
@@ -119,10 +119,9 @@ Install requirements to the virtual environment with pip:
 Export the following environment variables (or add them once to your shell profile)
 
 * `export OPENAI_API_KEY=`
-* `export PYTHONPATH=`
 
 See rubric tester options with:
-* `python lib/assessment/rubric_tester.py --help`
+* `bin/rubric_tester --help`
 
 ### example usage
 
@@ -132,7 +131,7 @@ GPT 3.5 Turbo is the default because a complete test run with that model costs o
 A recommended first run is to use default experiment and dataset, limited to 1 lesson:
 
 ```
-(.venv) Dave-MBP:~/src/aiproxy (rt-recover-from-bad-llm-responses)$ python ./lib/assessment/rubric_tester.py --lesson-names csd3-2023-L11
+(.venv) Dave-MBP:~/src/aiproxy (rt-recover-from-bad-llm-responses)$ bin/rubric_tester --lesson-names csd3-2023-L11
 2024-02-13 20:15:30,127: INFO: Evaluating lesson csd3-2023-L11 for dataset contractor-grades-batch-1-fall-2023 and experiment ai-rubrics-pilot-gpt-3.5-turbo...
 ```
 
@@ -150,7 +149,7 @@ The report that gets generated will contain a count of how many errors there wer
 In order to rerun only the failed student projects, you can pass the `-c` (`--use-cached`) option:
 
 ```commandline
-(.venv) Dave-MBP:~/src/aiproxy (rt-recover-from-bad-llm-responses)$ python ./lib/assessment/rubric_tester.py --lesson-names csd3-2023-L11 -c
+(.venv) Dave-MBP:~/src/aiproxy (rt-recover-from-bad-llm-responses)$ bin/rubric_tester --lesson-names csd3-2023-L11 -c
 ```
 
 ![Screenshot 2024-02-13 at 8 24 31 PM](https://github.com/code-dot-org/aiproxy/assets/8001765/ff560302-94b9-4966-a5d6-7d9a9fa54892)
 
@@ -163,7 +162,7 @@ After enough reruns, you'll have a complete accuracy measurement for the lesson.
 experiments run against GPT 4, GPT 4 Turbo and other pricey models should include report html and cached response data.
 this allows you to quickly view reports for these datasets either by looking directly at the `output/report*html` files or by regenerating the report against cached data via a command like:
 ```commandline
-python ./lib/assessment/rubric_tester.py --experiment-name ai-rubrics-pilot-baseline-gpt-4-turbo --use-cached
+bin/rubric_tester --experiment-name ai-rubrics-pilot-baseline-gpt-4-turbo --use-cached
 ```
 
 #### smaller test runs
diff --git a/lib/__init__.py b/aiproxy/__init__.py
similarity index 100%
rename from lib/__init__.py
rename to aiproxy/__init__.py
diff --git a/src/__init__.py b/aiproxy/app/__init__.py
similarity index 92%
rename from src/__init__.py
rename to aiproxy/app/__init__.py
index 0dafb2a..99b16ce 100644
--- a/src/__init__.py
+++ b/aiproxy/app/__init__.py
@@ -5,9 +5,9 @@
 import logging
 
 # Our modules
-from src.test import test_routes
-from src.openai import openai_routes
-from src.assessment import assessment_routes
+from .test import test_routes
+from .openai import openai_routes
+from .assessment import assessment_routes
 
 # Flask
 from flask import Flask
diff --git a/src/assessment.py b/aiproxy/app/assessment.py
similarity index 96%
rename from src/assessment.py
rename to aiproxy/app/assessment.py
index 6735d18..b827454 100644
--- a/src/assessment.py
+++ b/aiproxy/app/assessment.py
@@ -7,12 +7,12 @@
 import openai
 import json
 
-from lib.assessment.config import DEFAULT_MODEL
+from aiproxy.assessment.config import DEFAULT_MODEL
 
 # Our assessment code
-from lib.assessment import assess
-from lib.assessment.assess import KeyConceptError
-from lib.assessment.label import InvalidResponseError
+from aiproxy.assessment import assess
+from aiproxy.assessment.assess import KeyConceptError
+from aiproxy.assessment.label import InvalidResponseError
 
 assessment_routes = Blueprint('assessment_routes', __name__)
diff --git a/src/openai.py b/aiproxy/app/openai.py
similarity index 100%
rename from src/openai.py
rename to aiproxy/app/openai.py
diff --git a/src/test.py b/aiproxy/app/test.py
similarity index 100%
rename from src/test.py
rename to aiproxy/app/test.py
diff --git a/lib/assessment/__init__.py b/aiproxy/assessment/__init__.py
similarity index 100%
rename from lib/assessment/__init__.py
rename to aiproxy/assessment/__init__.py
diff --git a/lib/assessment/assess.py b/aiproxy/assessment/assess.py
similarity index 93%
rename from lib/assessment/assess.py
rename to aiproxy/assessment/assess.py
index 2909ced..ad93bb0 100644
--- a/lib/assessment/assess.py
+++ b/aiproxy/assessment/assess.py
@@ -7,8 +7,8 @@
 import logging
 
 # Import our support classes
-from lib.assessment.config import SUPPORTED_MODELS, DEFAULT_MODEL, VALID_LABELS
-from lib.assessment.label import Label
+from .config import SUPPORTED_MODELS, DEFAULT_MODEL, VALID_LABELS
+from .label import Label
 
 class KeyConceptError(Exception):
     pass
diff --git a/lib/assessment/config.py b/aiproxy/assessment/config.py
similarity index 100%
rename from lib/assessment/config.py
rename to aiproxy/assessment/config.py
diff --git a/lib/assessment/label.py b/aiproxy/assessment/label.py
similarity index 99%
rename from lib/assessment/label.py
rename to aiproxy/assessment/label.py
index 577f5de..5cfb889 100644
--- a/lib/assessment/label.py
+++ b/aiproxy/assessment/label.py
@@ -9,7 +9,7 @@ from threading import Lock
 
 from typing import List, Dict, Any
 
-from lib.assessment.config import VALID_LABELS
+from .config import VALID_LABELS
 
 from io import StringIO
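The renames above consolidate `src/` and `lib/` into a single `aiproxy` package, which is what allows the in-package relative imports (`from .config import VALID_LABELS`, `from .label import Label`) to resolve. A rough sketch of the resulting layout and how it is meant to be imported (illustrative only; paths inferred from the renames):

```python
# Layout implied by the renames (sketch):
#
#   aiproxy/
#       __init__.py
#       app/
#           __init__.py      # create_app(), registers the blueprints
#           assessment.py
#           openai.py
#           test.py
#       assessment/
#           __init__.py
#           assess.py        # from .config import ..., from .label import Label
#           config.py
#           label.py
#           report.py
#           rubric_tester.py
#
# Relative imports only resolve when these modules are loaded as part of the package,
# e.g. (assuming the repo root is on sys.path or the package is pip-installed):
from aiproxy.assessment.label import Label     # works: ".config" resolves inside aiproxy.assessment
# python aiproxy/assessment/label.py           # would fail: no parent package for ".config"
```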
diff --git a/lib/assessment/report.py b/aiproxy/assessment/report.py
similarity index 99%
rename from lib/assessment/report.py
rename to aiproxy/assessment/report.py
index 5a68c8d..99a6d4b 100644
--- a/lib/assessment/report.py
+++ b/aiproxy/assessment/report.py
@@ -4,7 +4,7 @@ import json
 import math
 from typing import List, Dict, Any
 
-from lib.assessment.config import VALID_LABELS
+from .config import VALID_LABELS
 
 class Report:
     def _compute_pass_fail_cell_color(self, actual, predicted, passing_labels):
diff --git a/lib/assessment/rubric_tester.py b/aiproxy/assessment/rubric_tester.py
similarity index 97%
rename from lib/assessment/rubric_tester.py
rename to aiproxy/assessment/rubric_tester.py
index 4a085e3..4a005db 100644
--- a/lib/assessment/rubric_tester.py
+++ b/aiproxy/assessment/rubric_tester.py
@@ -1,28 +1,30 @@
-#!/usr/bin/env python
-
-# Make sure the caller sees a helpful error message if they try to run this script with Python 2
-f"This script requires {'Python 3'}. Please be sure to activate your virtual environment via `source .venv/bin/activate`."
+#!/usr/bin/env python3
 
 import argparse
+import boto3
+import concurrent.futures
 import csv
 import glob
-import json
-import time
-import os
-from multiprocessing import Pool
-import concurrent.futures
 import io
+import json
 import logging
+import os
 import pprint
-import boto3
 import subprocess
+import sys
+import time
 
-from sklearn.metrics import accuracy_score, confusion_matrix
+from multiprocessing import Pool
 from collections import defaultdict
-from lib.assessment.config import SUPPORTED_MODELS, DEFAULT_MODEL, VALID_LABELS, LESSONS, DEFAULT_DATASET_NAME, DEFAULT_EXPERIMENT_NAME
-from lib.assessment.label import Label, InvalidResponseError
-from lib.assessment.report import Report
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+from .config import SUPPORTED_MODELS, DEFAULT_MODEL, VALID_LABELS, LESSONS, DEFAULT_DATASET_NAME, DEFAULT_EXPERIMENT_NAME
+from .label import Label, InvalidResponseError
+from .report import Report
+
+if 'OPEN_AI_KEY' not in os.environ:
+    print("Warning: OPEN_AI_KEY environment variable is not set.", file=sys.stderr)
 
 #globals
 prompt_file = 'system_prompt.txt'
diff --git a/bin/assessment-test.rb b/bin/assessment-test.rb
index 2c98e15..3385e7a 100755
--- a/bin/assessment-test.rb
+++ b/bin/assessment-test.rb
@@ -1,4 +1,4 @@
-#!/bin/env ruby
+#!/usr/bin/env ruby
 
 require 'net/http'
 require 'uri'
diff --git a/bin/rubric_tester b/bin/rubric_tester
new file mode 100755
index 0000000..2844bc7
--- /dev/null
+++ b/bin/rubric_tester
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Set current working dir to ../
+cd "$(dirname "$0")"/..
+
+source .venv/bin/activate
+python3 -m aiproxy.assessment.rubric_tester "$@"
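The new `bin/rubric_tester` wrapper simply activates the virtualenv, changes to the repo root, and runs the tester as a module so the package-relative imports above resolve. A rough Python-only equivalent of what the wrapper ends up doing (sketch; the lesson name is taken from the README example, and the virtualenv/cd steps are omitted):

```python
# Roughly what `bin/rubric_tester --lesson-names csd3-2023-L11` does under the hood:
# run aiproxy.assessment.rubric_tester as if it were invoked with `python3 -m`.
import runpy
import sys

sys.argv = ["rubric_tester", "--lesson-names", "csd3-2023-L11"]
runpy.run_module("aiproxy.assessment.rubric_tester", run_name="__main__")
```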
diff --git a/run.py b/run.py
new file mode 100755
index 0000000..8c243b0
--- /dev/null
+++ b/run.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+from aiproxy.app import create_app
+
+app = create_app()
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e8b7f92
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,14 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='aiproxy',
+    version='0.1',
+    packages=find_packages(),
+    install_requires=[line.strip() for line in open('requirements.txt')],
+    entry_points={
+        'console_scripts': [
+            'rubric_tester=aiproxy.assessment.rubric_tester:main',
+            'aiproxy=aiproxy.app:create_app',
+        ]
+    },
+)
diff --git a/tests/accuracy/test_accuracy.py b/tests/accuracy/test_accuracy.py
index 4da7f73..52f9ebb 100644
--- a/tests/accuracy/test_accuracy.py
+++ b/tests/accuracy/test_accuracy.py
@@ -3,7 +3,7 @@
 
 from unittest import mock
 
-from lib.assessment.rubric_tester import (
+from aiproxy.assessment.rubric_tester import (
     main,
 )
diff --git a/tests/conftest.py b/tests/conftest.py
index f459c5f..89d87e5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from src import create_app
+from aiproxy.app import create_app
 
 import contextlib
 import os
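For context, after an editable install (`pip install -e .`) setuptools generates a `rubric_tester` executable from the `console_scripts` entry point declared in setup.py above. It behaves roughly like the following stub (a sketch, not the literal generated file):

```python
# Approximately what the generated `rubric_tester` console script does:
# resolve the entry point `aiproxy.assessment.rubric_tester:main` and call it.
import sys

from aiproxy.assessment.rubric_tester import main

if __name__ == '__main__':
    sys.exit(main())
```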
diff --git a/tests/routes/test_assessment_routes.py b/tests/routes/test_assessment_routes.py
index 5fd726e..1a8c590 100644
--- a/tests/routes/test_assessment_routes.py
+++ b/tests/routes/test_assessment_routes.py
@@ -45,7 +45,7 @@ def test_should_return_400_when_no_rubric(self, client, randomstring):
         assert response.status_code == 400
 
     def test_should_return_400_on_openai_error(self, mocker, client, randomstring):
-        mocker.patch('lib.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
+        mocker.patch('aiproxy.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
         response = client.post('/assessment', data={
             "code": randomstring(10),
             "prompt": randomstring(10),
@@ -88,7 +88,7 @@ def test_should_return_400_when_passing_not_a_number_to_temperature(self, client
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_data(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         label_mock.return_value = []
 
         response = client.post('/assessment', data={
@@ -106,7 +106,7 @@ def test_should_return_400_when_the_label_function_does_not_return_data(self, mo
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_the_right_structure(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         label_mock.return_value = {
             'metadata': {},
             'data': {}
         }
@@ -127,7 +127,7 @@ def test_should_return_400_when_the_label_function_does_not_return_the_right_str
         assert response.status_code == 400
 
     def test_should_pass_arguments_to_label_function(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         data = {
             "code": randomstring(10),
             "prompt": randomstring(10),
@@ -155,7 +155,7 @@ def test_should_pass_arguments_to_label_function(self, mocker, client, randomstr
         )
 
     def test_should_return_the_result_from_label_function_when_valid(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         label_mock.return_value = {
             'metadata': {},
             'data': [
@@ -190,7 +190,7 @@ class TestPostTestAssessment:
     """
 
     def test_should_return_400_on_openai_error(self, mocker, client, randomstring):
-        mocker.patch('lib.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
+        mocker.patch('aiproxy.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         response = client.post('/test/assessment', data={
@@ -236,7 +236,7 @@ def test_should_return_400_when_passing_not_a_number_to_temperature(self, mocker
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_data(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = []
@@ -255,7 +255,7 @@ def test_should_return_400_when_the_label_function_does_not_return_data(self, mo
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_the_right_structure(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = {
@@ -277,7 +277,7 @@ def test_should_return_400_when_the_label_function_does_not_return_the_right_str
         assert response.status_code == 400
 
     def test_should_pass_arguments_to_label_function(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         data = {
@@ -305,7 +305,7 @@ def test_should_pass_arguments_to_label_function(self, mocker, client, randomstr
         )
 
     def test_should_return_the_result_from_label_function_when_valid(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = {
@@ -341,7 +341,7 @@ class TestPostBlankAssessment:
     """
 
     def test_should_return_400_on_openai_error(self, mocker, client, randomstring):
-        mocker.patch('lib.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
+        mocker.patch('aiproxy.assessment.assess.label').side_effect = openai.error.InvalidRequestError('', '')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         response = client.post('/test/assessment/blank', data={
@@ -384,7 +384,7 @@ def test_should_return_400_when_passing_not_a_number_to_temperature(self, mocker
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_data(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = []
@@ -402,7 +402,7 @@ def test_should_return_400_when_the_label_function_does_not_return_data(self, mo
         assert response.status_code == 400
 
     def test_should_return_400_when_the_label_function_does_not_return_the_right_structure(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = {
@@ -423,7 +423,7 @@ def test_should_return_400_when_the_label_function_does_not_return_the_right_str
         assert response.status_code == 400
 
     def test_should_pass_arguments_including_blank_code_to_label_function(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         data = {
@@ -450,7 +450,7 @@ def test_should_pass_arguments_including_blank_code_to_label_function(self, mock
         )
 
     def test_should_return_the_result_from_label_function_when_valid(self, mocker, client, randomstring):
-        label_mock = mocker.patch('lib.assessment.assess.label')
+        label_mock = mocker.patch('aiproxy.assessment.assess.label')
         mock_open = mocker.mock_open(read_data='file data')
         mock_file = mocker.patch('builtins.open', mock_open)
         label_mock.return_value = {
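The reason for every one of these one-line changes: `mocker.patch` takes a dotted import path and replaces the attribute on that module, so each patch target has to follow the module to its new home under `aiproxy`. A minimal sketch of the pattern these tests rely on (using pytest-mock's `mocker` fixture; the test body is illustrative, not taken from the suite):

```python
# Patch the attribute on the module the code under test actually imports,
# i.e. aiproxy.assessment.assess.label rather than the old lib.assessment path.
def test_label_is_replaced(mocker):
    label_mock = mocker.patch('aiproxy.assessment.assess.label')
    label_mock.return_value = {'metadata': {}, 'data': []}

    from aiproxy.assessment import assess
    assert assess.label is label_mock   # callers of assess.label see the mock while the patch is active
```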
diff --git a/tests/unit/assessment/test_assessment.py b/tests/unit/assessment/test_assessment.py
index c2cc7df..69704f6 100644
--- a/tests/unit/assessment/test_assessment.py
+++ b/tests/unit/assessment/test_assessment.py
@@ -2,14 +2,14 @@
 
 import pytest
 
-from lib.assessment.label import Label
-from lib.assessment.assess import label, KeyConceptError
+from aiproxy.assessment.label import Label
+from aiproxy.assessment.assess import label, KeyConceptError
 
 
 def test_label_should_pass_arguments_along(
         mocker, code, prompt, rubric, examples, openai_api_key, llm_model, num_responses, temperature, remove_comments):
-    """ Tests lib.assessment.assess.label()
+    """ Tests aiproxy.assessment.assess.label()
     """
 
     # import test data
@@ -52,7 +52,7 @@ def test_label_should_pass_arguments_along(
 
 def test_label_should_set_api_key_in_env_var(
         mocker, code, prompt, rubric, examples, openai_api_key, llm_model, num_responses, temperature, remove_comments):
-    """ Tests lib.assessment.assess.label()
+    """ Tests aiproxy.assessment.assess.label()
     """
 
     # Mock the Label() class
@@ -75,7 +75,7 @@ def test_label_should_set_api_key_in_env_var(
 
 def test_label_should_return_empty_result_when_no_api_key(
         mocker, code, prompt, rubric, examples, llm_model, num_responses, temperature, remove_comments):
-    """ Tests lib.assessment.assess.label() (without an api-key)
+    """ Tests aiproxy.assessment.assess.label() (without an api-key)
     """
 
     # Mock the Label() class
@@ -97,7 +97,7 @@ def test_label_should_return_empty_result_when_no_api_key(
 
 def test_label_should_return_empty_result_when_example_and_rubric_key_concepts_mismatch(
         mocker, code, prompt, rubric, examples, openai_api_key, llm_model, num_responses, temperature, remove_comments):
-    """ Tests lib.assessment.assess.label() (without an api-key)
+    """ Tests aiproxy.assessment.assess.label() (without an api-key)
     """
     # Mock the Label() class
     label_student_work = mocker.patch.object(Label, 'label_student_work')
@@ -122,7 +122,7 @@ def test_label_should_return_empty_result_when_example_and_rubric_key_concepts_m
 
 def test_label_should_call_label_student_work_with_api_key_in_env_var(
         mocker, code, prompt, rubric, examples, openai_api_key, llm_model, num_responses, temperature, remove_comments):
-    """ Tests lib.assessment.assess.label() (without an api-key)
+    """ Tests aiproxy.assessment.assess.label() (without an api-key)
     """
 
     # Set the environment variable
diff --git a/tests/unit/assessment/test_label.py b/tests/unit/assessment/test_label.py
index d3a66fc..a99b0ea 100644
--- a/tests/unit/assessment/test_label.py
+++ b/tests/unit/assessment/test_label.py
@@ -9,7 +9,7 @@ import requests
 
 import pytest
 
-from lib.assessment.label import Label, InvalidResponseError
+from aiproxy.assessment.label import Label, InvalidResponseError
 
 
 @pytest.fixture
@@ -439,7 +439,7 @@ def test_should_pass_arguments_to_openai(self, requests_mock, mocker, openai_gpt
         assert requests_mock.last_request.json()['messages'] == messages
 
     def test_should_raise_timeout(self, mocker, label, prompt, rubric, code, student_id, examples, num_responses, temperature, llm_model):
-        mocker.patch('lib.assessment.label.requests.post', side_effect = requests.exceptions.ReadTimeout())
+        mocker.patch('aiproxy.assessment.label.requests.post', side_effect = requests.exceptions.ReadTimeout())
 
         # Mock out compute_messages
         compute_messages = mocker.patch.object(Label, 'compute_messages')
@@ -530,7 +530,7 @@ def test_should_open_cached_responses_when_asked_and_they_exist(self, mocker, la
         mock_file = mocker.patch('builtins.open', mock_open)
 
         # Mock the file exists
-        exists_mock = mocker.patch('lib.assessment.label.os.path.exists', return_value=True)
+        exists_mock = mocker.patch('aiproxy.assessment.label.os.path.exists', return_value=True)
 
         result = label.label_student_work(
             prompt, rubric, code, student_id,
@@ -555,7 +555,7 @@ def test_should_write_cached_responses_when_asked(self, mocker, label, assessmen
         mock_file = mocker.patch('builtins.open', mock_open)
 
         # Mock the file so it does not exist
-        exists_mock = mocker.patch('lib.assessment.label.os.path.exists', return_value=False)
+        exists_mock = mocker.patch('aiproxy.assessment.label.os.path.exists', return_value=False)
 
         # Get mocks
         statically_label_student_work_mock = mocker.patch.object(
diff --git a/tests/unit/assessment/test_report.py b/tests/unit/assessment/test_report.py
index b4079b9..1ef6f0b 100644
--- a/tests/unit/assessment/test_report.py
+++ b/tests/unit/assessment/test_report.py
@@ -4,7 +4,7 @@
 import pytest
 import random
 
-from lib.assessment.report import Report
+from aiproxy.assessment.report import Report
 
 
 @pytest.fixture
diff --git a/tests/unit/assessment/test_rubric_tester.py b/tests/unit/assessment/test_rubric_tester.py
index 321df79..692d363 100644
--- a/tests/unit/assessment/test_rubric_tester.py
+++ b/tests/unit/assessment/test_rubric_tester.py
@@ -5,7 +5,7 @@
 
 from unittest import mock
 from types import SimpleNamespace
-from lib.assessment.rubric_tester import (
+from aiproxy.assessment.rubric_tester import (
     read_and_label_student_work,
     get_passing_labels,
     read_inputs,
@@ -19,7 +19,7 @@
     get_examples,
 )
 
-from lib.assessment.label import Label, InvalidResponseError
+from aiproxy.assessment.label import Label, InvalidResponseError
 
 
 class TestReadAndLabelStudentWork:
@@ -321,8 +321,8 @@ class TestMain:
 
 class TestInit:
     def test_should_call_main_when_running_by_itself(self, mocker):
-        main_mock = mocker.patch('lib.assessment.rubric_tester.main')
-        mocker.patch('lib.assessment.rubric_tester.__name__', '__main__')
+        main_mock = mocker.patch('aiproxy.assessment.rubric_tester.main')
+        mocker.patch('aiproxy.assessment.rubric_tester.__name__', '__main__')
 
         init()
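For reference, the `TestInit` test above patches `aiproxy.assessment.rubric_tester.__name__` and then calls `init()`, which implies the module guards its entry point roughly as sketched below (the `init()`/`main()` names come from the tests; the bodies are assumed):

```python
# Sketch of the entry-point guard implied by TestInit in aiproxy/assessment/rubric_tester.py;
# it is what makes both `python3 -m aiproxy.assessment.rubric_tester` and the
# `rubric_tester` console script invoke main() exactly once.
def main():
    ...  # parse arguments and run the rubric tester


def init():
    if __name__ == '__main__':
        main()


init()
```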