Skip to content

Commit

Permalink
Prompt Engineering (#4)
Browse files Browse the repository at this point in the history
* Grader

* Better prompt, better results

* Prompt engineering
  • Loading branch information
s2t2 authored Dec 16, 2023
1 parent 8ffca9f commit 50498ec
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 22 deletions.
9 changes: 5 additions & 4 deletions app/response_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from typing import List
from pydantic import BaseModel, Field

COMMENTS = "The comment to accompany the score. Provides justification for the score. Cites specific content present or absent from the response."
CONFIDENCE_SCORE = "Confidence level in the score. Values range between 0 (low confidence) and 1 (high confidence)"
COMMENTS = "Comment to accompany the score."
CONFIDENCE_SCORE = "Confidence level in the score. Values range from 0.0 (low) to 1.0 (high confidence)."

ONE_TO_FIVE_SCORE = "The score. Values range from 1 (low) to 5 (high), in increments of 0.25 (where 1 is poor, 3 is decent, 4 is good, 4.5 is great, and 5 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses all the questions."
ZERO_TO_ONE_SCORE = "The score. Values range from 0 (low) to 1 (high), in increments of 0.05 (where 0 is unattempted or blank, 0.5 is incorrect or incomplete, 0.75 is good, 0.9 is great, and 1.0 is perfect). Indicates the degree to which the response completely, thoroughly, and accurately addresses the question."
#ZERO_TO_ONE_SCORE = "The score. Values range from 0 (low) to 1 (high), in increments of 0.25 (where 0 is unattempted or blank, 0.5 is incorrect or incomplete, 0.75 is good, and 1.0 is great / thorough). Indicates the degree to which the response completely, thoroughly, and accurately addresses the question."
ZERO_TO_ONE_SCORE = "The score. Values generally range from 0.0 (low) to 1.0 (high), although it is possible for score to be greater than 1.0."
ONE_TO_FIVE_SCORE = "The score. Values generally range from 1.0 (low) to 5.0 (high)."

class Student(BaseModel):
"""A student."""
Expand Down
76 changes: 58 additions & 18 deletions app/submissions_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from app.prompts.homework_4 import HOMEWORK_QUESTIONS
from app.submissions_retriever import SubmissionsRetriever, UNIQUE_ONLY, DOCS_LIMIT
from app.openai_llm import create_llm, MODEL_NAME, TEMP
from app.response_formatters import Student, QuestionScoring
from app.response_formatters import Student, QuestionScoring, ZERO_TO_ONE_SCORE, COMMENTS, CONFIDENCE_SCORE


def get_relevant_docs(retriever, query, verbose=True):
Expand All @@ -30,25 +30,22 @@ def get_relevant_docs(retriever, query, verbose=True):
return relevant_docs



SYSTEM_INSTRUCTIONS = """You are an experienced machine learning practitioner and instructor. Your goal is to grade a student's machine learning homework assignment. Provide a score (and corresponding comments) to indicate how completely and accurately the student addressed the following question:"""

QA_CONTEXT_TEMPLATE = """Answer the **query**, based on the provided **context**, and format your response according to the **formatting instructions** (avoid using special characters).
**Context**: {context}
**Query**: {query}
**Formatting Instructions**: {formatting_instructions}
"""


#QA_CONTEXT_TEMPLATE = """Answer the **query**, based on the provided **context**.
#
#**Context**: {context}
#
#**Query**: {query}
#"""

# Prompt template with three str.format-style placeholders: {query}, {context}
# and {formatting_instructions}. Each labelled section sits on its own line and
# the template ends with a trailing newline.
QA_CONTEXT_TEMPLATE = (
    "Answer the **query**, based only on the provided **context**, "
    "and format your response according to the **formatting instructions** "
    "(avoid using special characters).\n"
    "**Query**: {query}\n"
    "**Context**: {context}\n"
    "**Formatting Instructions**: {formatting_instructions}\n"
)

def qa_chain(llm, query, compression_retriever, parser_class, verbose=False):
# https://www.youtube.com/watch?v=yriZBFKE9JU
Expand All @@ -66,6 +63,51 @@ def qa_chain(llm, query, compression_retriever, parser_class, verbose=False):
return parsed_response


#QUESTION_SCORING_INSTRUCTIONS = f"""
#You are a helpful and experienced machine learning practitioner and instructor (i.e. the "grading assistant").
#Your goal is to accurately grade a student's machine learning homework assignment.
#You will be provided a question, and your task is to provide a score and corresponding comment,
#based on some provided context about the student's response.
#
# + What 'score' would you give the response for this question? {ZERO_TO_ONE_SCORE}
#
# + And why (i.e. your 'comments')? {COMMENTS}
#
# + And how sure are you about this score (i.e. your 'confidence'), as a percentage between 0 (low confidence) and 1 (high confidence)? {CONFIDENCE_SCORE}
#
#NOTE: It is important to grade accurately and fairly, so if you don't know, we'd rather you provide a low confidence, and a low score, and a comment saying you're not sure.
#
#NOTE: If you don't have any context, or if you don't think the context is relevant enough, you can provide a zero.
#"""


# Grading instructions for the LLM: the role it plays, the score rubric
# (0, 0.5, 0.75, 1.0, 1.25), and the request for a confidence and a justifying
# comment. This is a plain string rather than an f-string: the text has no
# placeholders, so the f-prefix did nothing and would have mis-parsed any
# literal brace added to the prompt later.
QUESTION_SCORING_INSTRUCTIONS = """
You are an experienced machine learning practitioner and instructor (i.e. the Grader).
Your goal is to accurately grade a student's machine learning homework assignment.
You will be provided a question that the student was supposed to answer,
and your task is to grade how well the student answered that question,
based only on some context provided about the student's response.
Grading Guidance:
+ What 'score' would you give the response for this question?
If you don't have any context, or if you don't think the context is relevant enough, you should assign a score of 0.
If the student's response was off-topic, not specific enough, or not what the question is looking for, you should assign a score of 0.5.
If the response was generally good, but there were some minor issue(s), you should assign a score of 0.75.
If the response was relevant and correct, you should assign a score of 1.0.
If the response was relevant and correct, and very thorough and detailed, you should assign a score of 1.25.
+ How certain are you about this score (i.e. your 'confidence')?
+ And why (i.e. your 'comments' about the score and/or the confidence)?
You should provide specific justification for the score.
You should cite specific content present or absent from the response, as well as your reasoning for providing the score.
REMEMBER: It is very important to grade accurately, so it is imperative that you only grade based on the provided context,
and you will prefer to give low confidence and a corresponding comment if you're not sure or if you don't have the context you need.
"""


class SubmissionsGrader(SubmissionsRetriever):

def __init__(self, unique_only=UNIQUE_ONLY, similarity_threshold=SIMILARITY_THRESHOLD, docs_limit=DOCS_LIMIT,
Expand Down Expand Up @@ -121,11 +163,13 @@ def perform(self):

record = {"filename": filename, "file_id": dp.file_id} # flattened structure, one row per submission document
try:

student = qa_chain(llm=self.llm, query=STUDENT_QUERY, compression_retriever=compression_retriever, parser_class=Student)
record = {**record, **{"student_id": student.net_id, "student_name": student.name}}

i = 1
for query_id, query in self.homework_questions:
query = f"{QUESTION_SCORING_INSTRUCTIONS} {query}"
scoring = qa_chain(llm=self.llm, query=query, compression_retriever=compression_retriever, parser_class=QuestionScoring)
record[f"scoring_{i}_question_id"] = scoring.question_id
record[f"scoring_{i}_score"] = scoring.score
Expand Down Expand Up @@ -154,13 +198,9 @@ def perform(self):
#self.errors_df.to_csv(self.errors_csv_filepath, index=False)






if __name__ == "__main__":



grader = SubmissionsGrader()
grader.perform()

Expand Down

0 comments on commit 50498ec

Please sign in to comment.