From 8fd315d26466004a79d8b6ea5c9a7c5f3238a26c Mon Sep 17 00:00:00 2001
From: UmerHA <40663591+UmerHA@users.noreply.github.com>
Date: Mon, 3 Jul 2023 21:28:34 +0200
Subject: [PATCH] Implemented logging token usage (solves #322) (#438)

* Implemented logging token usage

Token usage is now tracked and logged to memory/logs/token_usage

* Step names are now inferred from the function name

* Incorporated Anton's feedback

- Made TokenUsage a dataclass
- For token logging, the step name is now inferred via the inspect module

* Formatted (black/ruff)

* Update gpt_engineer/ai.py

Co-authored-by: Anton Osika

* formatting

---------

Co-authored-by: Anton Osika
---
 gpt_engineer/ai.py    | 92 +++++++++++++++++++++++++++++++++++++++++--
 gpt_engineer/main.py  |  2 +
 gpt_engineer/steps.py | 31 +++++++++------
 pyproject.toml        |  1 +
 4 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/gpt_engineer/ai.py b/gpt_engineer/ai.py
index 6430c3727e..e5156e97ad 100644
--- a/gpt_engineer/ai.py
+++ b/gpt_engineer/ai.py
@@ -2,25 +2,54 @@
 import logging
 
+from dataclasses import dataclass
 from typing import Dict, List
 
 import openai
+import tiktoken
 
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class TokenUsage:
+    step_name: str
+    in_step_prompt_tokens: int
+    in_step_completion_tokens: int
+    in_step_total_tokens: int
+    total_prompt_tokens: int
+    total_completion_tokens: int
+    total_tokens: int
+
+
 class AI:
     def __init__(self, model="gpt-4", temperature=0.1):
         self.temperature = temperature
         self.model = model
 
-    def start(self, system, user):
+        # initialize token usage log
+        self.cumulative_prompt_tokens = 0
+        self.cumulative_completion_tokens = 0
+        self.cumulative_total_tokens = 0
+        self.token_usage_log = []
+
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(model)
+        except KeyError:
+            logger.debug(
+                f"Tiktoken encoder for model {model} not found. Using "
+                "cl100k_base encoder instead. The results may therefore be "
+                "inaccurate and should only be used as an estimate."
+            )
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
+
+    def start(self, system, user, step_name):
         messages = [
             {"role": "system", "content": system},
             {"role": "user", "content": user},
         ]
 
-        return self.next(messages)
+        return self.next(messages, step_name=step_name)
 
     def fsystem(self, msg):
         return {"role": "system", "content": msg}
@@ -31,7 +60,7 @@ def fuser(self, msg):
     def fassistant(self, msg):
         return {"role": "assistant", "content": msg}
 
-    def next(self, messages: List[Dict[str, str]], prompt=None):
+    def next(self, messages: List[Dict[str, str]], prompt=None, *, step_name=None):
         if prompt:
             messages += [{"role": "user", "content": prompt}]
 
@@ -52,8 +81,65 @@ def next(self, messages: List[Dict[str, str]], prompt=None):
         print()
         messages += [{"role": "assistant", "content": "".join(chat)}]
         logger.debug(f"Chat completion finished: {messages}")
+
+        self.update_token_usage_log(
+            messages=messages, answer="".join(chat), step_name=step_name
+        )
+
         return messages
 
+    def update_token_usage_log(self, messages, answer, step_name):
+        prompt_tokens = self.num_tokens_from_messages(messages)
+        completion_tokens = self.num_tokens(answer)
+        total_tokens = prompt_tokens + completion_tokens
+
+        self.cumulative_prompt_tokens += prompt_tokens
+        self.cumulative_completion_tokens += completion_tokens
+        self.cumulative_total_tokens += total_tokens
+
+        self.token_usage_log.append(
+            TokenUsage(
+                step_name=step_name,
+                in_step_prompt_tokens=prompt_tokens,
+                in_step_completion_tokens=completion_tokens,
+                in_step_total_tokens=total_tokens,
+                total_prompt_tokens=self.cumulative_prompt_tokens,
+                total_completion_tokens=self.cumulative_completion_tokens,
+                total_tokens=self.cumulative_total_tokens,
+            )
+        )
+
+    def format_token_usage_log(self):
+        result = "step_name,"
+        result += "prompt_tokens_in_step,completion_tokens_in_step,total_tokens_in_step"
+        result += ",total_prompt_tokens,total_completion_tokens,total_tokens\n"
+        for log in self.token_usage_log:
+            result += log.step_name + ","
+            result += str(log.in_step_prompt_tokens) + ","
+            result += str(log.in_step_completion_tokens) + ","
+            result += str(log.in_step_total_tokens) + ","
+            result += str(log.total_prompt_tokens) + ","
+            result += str(log.total_completion_tokens) + ","
+            result += str(log.total_tokens) + "\n"
+        return result
+
+    def num_tokens(self, txt):
+        return len(self.tokenizer.encode(txt))
+
+    def num_tokens_from_messages(self, messages):
+        """Returns the number of tokens used by a list of messages."""
+        n_tokens = 0
+        for message in messages:
+            n_tokens += (
+                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+            )
+            for key, value in message.items():
+                n_tokens += self.num_tokens(value)
+                if key == "name":  # if there's a name, the role is omitted
+                    n_tokens += -1  # role is always required and always 1 token
+        n_tokens += 2  # every reply is primed with <im_start>assistant
+        return n_tokens
+
 
 def fallback_model(model: str) -> str:
     try:
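For reference, the per-message accounting that `num_tokens_from_messages` performs above can be reproduced outside the `AI` class. The sketch below is a minimal standalone version, assuming only that `tiktoken` is installed; the model name and sample message are illustrative:

```python
from typing import Dict, List

import tiktoken


def count_message_tokens(messages: List[Dict[str, str]], model: str = "gpt-4") -> int:
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # Same fallback as in AI.__init__ when tiktoken doesn't know the model.
        encoder = tiktoken.get_encoding("cl100k_base")
    n_tokens = 0
    for message in messages:
        n_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for key, value in message.items():
            n_tokens += len(encoder.encode(value))
            if key == "name":  # if there's a name, the role is omitted
                n_tokens -= 1
    return n_tokens + 2  # every reply is primed with <im_start>assistant


print(count_message_tokens([{"role": "user", "content": "Hello!"}]))
```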
diff --git a/gpt_engineer/main.py b/gpt_engineer/main.py
index e81de45f70..ae384a9777 100644
--- a/gpt_engineer/main.py
+++ b/gpt_engineer/main.py
@@ -61,6 +61,8 @@ def main(
     if collect_consent():
         collect_learnings(model, temperature, steps, dbs)
 
+    dbs.logs["token_usage"] = ai.format_token_usage_log()
+
 
 if __name__ == "__main__":
     app()
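The log written here is plain CSV, so it can be inspected with the standard library alone. A small sketch, assuming the default on-disk location (`memory/logs/token_usage`, per the commit message) inside a hypothetical project directory:

```python
import csv
from pathlib import Path

# The project path is hypothetical; adjust to wherever gpt-engineer ran.
log_path = Path("projects/example/memory/logs/token_usage")
with log_path.open(newline="") as f:
    for row in csv.DictReader(f):
        print(row["step_name"], row["total_tokens"])
```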
diff --git a/gpt_engineer/steps.py b/gpt_engineer/steps.py
index 7a012f4b74..304d6467d1 100644
--- a/gpt_engineer/steps.py
+++ b/gpt_engineer/steps.py
@@ -1,3 +1,4 @@
+import inspect
 import json
 import re
 import subprocess
@@ -35,12 +36,17 @@ def get_prompt(dbs: DBs) -> str:
     return dbs.input["prompt"]
 
 
+def curr_fn() -> str:
+    """Get the name of the current function"""
+    return inspect.stack()[1].function
+
+
 # All steps below have the signature Step
 
 
 def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
     """Run the AI on the main prompt and save the results"""
-    messages = ai.start(setup_sys_prompt(dbs), get_prompt(dbs))
+    messages = ai.start(setup_sys_prompt(dbs), get_prompt(dbs), step_name=curr_fn())
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
 
@@ -52,7 +58,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
     messages = [ai.fsystem(dbs.preprompts["qa"])]
     user_input = get_prompt(dbs)
     while True:
-        messages = ai.next(messages, user_input)
+        messages = ai.next(messages, user_input, step_name=curr_fn())
 
         if messages[-1]["content"].strip() == "Nothing more to clarify.":
             break
@@ -71,6 +77,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
             messages = ai.next(
                 messages,
                 "Make your own assumptions and state them explicitly before starting",
+                step_name=curr_fn(),
             )
             print()
             return messages
@@ -97,7 +104,7 @@ def gen_spec(ai: AI, dbs: DBs) -> List[dict]:
         ai.fsystem(f"Instructions: {dbs.input['prompt']}"),
     ]
 
-    messages = ai.next(messages, dbs.preprompts["spec"])
+    messages = ai.next(messages, dbs.preprompts["spec"], step_name=curr_fn())
 
     dbs.memory["specification"] = messages[-1]["content"]
 
@@ -108,7 +115,7 @@ def respec(ai: AI, dbs: DBs) -> List[dict]:
     messages = json.loads(dbs.logs[gen_spec.__name__])
     messages += [ai.fsystem(dbs.preprompts["respec"])]
 
-    messages = ai.next(messages)
+    messages = ai.next(messages, step_name=curr_fn())
     messages = ai.next(
         messages,
         (
@@ -119,6 +126,7 @@ def respec(ai: AI, dbs: DBs) -> List[dict]:
             "If you are satisfied with the specification, just write out the "
             "specification word by word again."
         ),
+        step_name=curr_fn(),
     )
 
     dbs.memory["specification"] = messages[-1]["content"]
@@ -135,7 +143,7 @@ def gen_unit_tests(ai: AI, dbs: DBs) -> List[dict]:
         ai.fuser(f"Specification:\n\n{dbs.memory['specification']}"),
     ]
 
-    messages = ai.next(messages, dbs.preprompts["unit_tests"])
+    messages = ai.next(messages, dbs.preprompts["unit_tests"], step_name=curr_fn())
 
     dbs.memory["unit_tests"] = messages[-1]["content"]
     to_files(dbs.memory["unit_tests"], dbs.workspace)
@@ -145,13 +153,12 @@ def gen_unit_tests(ai: AI, dbs: DBs) -> List[dict]:
 
 def gen_clarified_code(ai: AI, dbs: DBs) -> List[dict]:
     """Takes clarification and generates code"""
-
     messages = json.loads(dbs.logs[clarify.__name__])
 
     messages = [
         ai.fsystem(setup_sys_prompt(dbs)),
     ] + messages[1:]
-    messages = ai.next(messages, dbs.preprompts["use_qa"])
+    messages = ai.next(messages, dbs.preprompts["use_qa"], step_name=curr_fn())
 
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
@@ -159,14 +166,13 @@ def gen_clarified_code(ai: AI, dbs: DBs) -> List[dict]:
 
 def gen_code(ai: AI, dbs: DBs) -> List[dict]:
     # get the messages from previous step
-
     messages = [
         ai.fsystem(setup_sys_prompt(dbs)),
         ai.fuser(f"Instructions: {dbs.input['prompt']}"),
         ai.fuser(f"Specification:\n\n{dbs.memory['specification']}"),
         ai.fuser(f"Unit tests:\n\n{dbs.memory['unit_tests']}"),
     ]
-    messages = ai.next(messages, dbs.preprompts["use_qa"])
+    messages = ai.next(messages, dbs.preprompts["use_qa"], step_name=curr_fn())
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
 
@@ -224,6 +230,7 @@ def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
             "if necessary.\n"
         ),
         user="Information about the codebase:\n\n" + dbs.workspace["all_output.txt"],
+        step_name=curr_fn(),
     )
     print()
 
@@ -240,7 +247,7 @@ def use_feedback(ai: AI, dbs: DBs):
         ai.fassistant(dbs.workspace["all_output.txt"]),
         ai.fsystem(dbs.preprompts["use_feedback"]),
     ]
-    messages = ai.next(messages, dbs.input["feedback"])
+    messages = ai.next(messages, dbs.input["feedback"], step_name=curr_fn())
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
 
@@ -253,7 +260,9 @@ def fix_code(ai: AI, dbs: DBs):
         ai.fuser(code_output),
         ai.fsystem(dbs.preprompts["fix_code"]),
     ]
-    messages = ai.next(messages, "Please fix any errors in the code above.")
+    messages = ai.next(
+        messages, "Please fix any errors in the code above.", step_name=curr_fn()
+    )
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
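The `curr_fn()` helper works because `inspect.stack()[1]` is the frame of whichever function called it, so each step can report its own name without hardcoding strings. A self-contained sketch of the mechanism:

```python
import inspect


def curr_fn() -> str:
    """Get the name of the calling function."""
    return inspect.stack()[1].function


def gen_spec() -> str:
    return curr_fn()


print(gen_spec())  # prints "gen_spec"
```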
diff --git a/pyproject.toml b/pyproject.toml
index d6a00f05b9..c61ff08346 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     'typer >= 0.3.2',
     'rudder-sdk-python == 2.0.2',
     'dataclasses-json == 0.5.7',
+    'tiktoken',
     'tabulate == 0.9.0',
 ]
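Since `tabulate` is already a dependency, the CSV emitted by `format_token_usage_log()` can be pretty-printed for a quick overview. A sketch with a made-up log row (the token counts are invented for illustration):

```python
import csv
import io

from tabulate import tabulate

csv_log = (
    "step_name,prompt_tokens_in_step,completion_tokens_in_step,"
    "total_tokens_in_step,total_prompt_tokens,total_completion_tokens,total_tokens\n"
    "simple_gen,1200,350,1550,1200,350,1550\n"  # invented example values
)
rows = list(csv.reader(io.StringIO(csv_log)))
print(tabulate(rows[1:], headers=rows[0], tablefmt="github"))
```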