Cumulative fixes for AI features (#187)
openvmp authored Sep 23, 2024
1 parent 26a0b85 commit c43e380
Showing 7 changed files with 374 additions and 98 deletions.
17 changes: 17 additions & 0 deletions docs/source/configuration.rst
@@ -442,13 +442,30 @@ Generate OpenSCAD, CadQuery or build123d scripts with Generative AI using the following configuration:
<part name>:
type: <ai-openscad|ai-cadquery|ai-build123d>
provider: <google|openai|ollama, the model provider to use>
model: <(optional) the model to use>
tokens: <(optional) the limit of token context>
temperature: <(optional) the temperature LLM parameter>
top_p: <(optional) the top_p LLM parameter>
top_k: <(optional, openai|ollama) the top_k LLM parameter>
images: <(optional) contextual images as input for AI>
- <image path>
The following models are recommended for use:

+----------+----------------------------+
| Provider | Model |
+==========+============================+
| google | - gemini-1.5-pro (default) |
| | - gemini-1.5-flash |
+----------+----------------------------+
| openai | - gpt-4o (default) |
| | - gpt-4o-mini |
+----------+----------------------------+
| ollama | - llama-3.1:8b |
| | - llama-3.1:70b (default) |
| | - llama-3.1:405b |
+----------+----------------------------+
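
As an illustration, a part entry that relies on the defaults above might look like the following (the part name and image path are invented for this example):

    bracket:
      type: ai-build123d
      provider: openai
      temperature: 0.5
      images:
        - ./images/bracket-sketch.png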

+---------------------------+-------------------------------------------------------------------------------------------------------------------------+
| Example | Result |
+===========================+=========================================================================================================================+
15 changes: 8 additions & 7 deletions partcad/src/partcad/ai.py
@@ -24,6 +24,7 @@
"gpt-4-vision-preview",
"gpt-4o",
"gpt-4o-mini",
"o1-*",
"gemini-pro",
"gemini-pro-vision",
"gemini-1.5-pro",
@@ -46,12 +47,15 @@ def generate(
prompt: str,
config: dict[str, str],
num_options: int = 1,
image_filenames: list[str] = [],
):
with pc_logging.Action("Ai" + action, package, item):
# Determine the model to use
provider = config.get("provider", None)
if "model" in config and config["model"] is not None:
if (
"model" in config
and config["model"] is not None
and config["model"] != ""
):
model = config["model"]
else:
if provider is None:
@@ -68,9 +72,9 @@
model = "gemini-1.5-pro"
elif provider == "openai":
# if len(image_filenames) > 0:
# model = "gpt-4-vision-preview"
# model = "gpt-4o"
# else:
# model = "gpt-4"
# model = "gpt-4o"
model = "gpt-4o"
elif provider == "ollama":
model = "llama3.1:70b"
@@ -96,7 +100,6 @@
result = self.generate_google(
model,
prompt,
image_filenames,
config,
num_options,
)
@@ -111,7 +114,6 @@
result = self.generate_openai(
model,
prompt,
image_filenames,
config,
num_options,
)
@@ -126,7 +128,6 @@
result = self.generate_ollama(
model,
prompt,
image_filenames,
config,
num_options,
)
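
With this change, an empty or missing `model` value falls back to the provider default (gemini-1.5-pro, gpt-4o, or llama3.1:70b), and the provider back-ends no longer receive a separate image_filenames list; images are referenced inline in the prompt through `INSERT_IMAGE_HERE(<path>)` markers. A hypothetical caller-side sketch of that convention (the part name, path, and the commented-out call are assumptions, not code from this commit):

    config = {
        "provider": "openai",
        "model": "",  # an empty string now falls back to the provider default (gpt-4o)
        "temperature": 0.4,
    }

    # Images are embedded in the prompt instead of being passed as a separate list:
    prompt = (
        "Write a build123d script for the bracket shown in "
        "INSERT_IMAGE_HERE(./images/bracket.png). Dimensions are in millimeters."
    )

    # scripts = self.generate("Build123d", package, item, prompt, config, num_options=2)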
28 changes: 17 additions & 11 deletions partcad/src/partcad/ai_google.py
@@ -8,6 +8,7 @@
#

import importlib
import re
import threading
import time
from typing import Any
@@ -68,7 +69,6 @@ def generate_google(
self,
model: str,
prompt: str,
image_filenames: list[str] = [],
config: dict[str, Any] = {},
options_num: int = 1,
):
@@ -95,13 +95,21 @@
else:
temperature = None

images = list(
map(
lambda f: pil_image.open(f),
image_filenames,
)
)
contents = [prompt, *images]
image_content = []

def insert_image(match):
filename = match.group(1)
image_content.append(pil_image.open(filename))
return "IMAGE_INSERTED_HERE"

prompt = re.sub(r"INSERT_IMAGE_HERE\(([^)]*)\)", insert_image, prompt)
text_content = prompt.split("IMAGE_INSERTED_HERE")

content = []
for i in range(len(text_content)):
content.append(text_content[i])
if i < len(image_content):
content.append(image_content[i])

client = google_genai.GenerativeModel(
model,
@@ -121,9 +129,7 @@
while retry == True:
retry = False
try:
response = client.generate_content(
contents,
)
response = client.generate_content(content)
except google_api_core_exceptions.ResourceExhausted as e:
pc_logging.exception(e)
retry = True
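
Each provider back-end now parses the prompt for `INSERT_IMAGE_HERE(<path>)` markers, splits the text around them, and interleaves the loaded images with the text fragments, as in the ai_google.py hunk above. A minimal standalone sketch of that parsing step (the helper name and return shape are invented; the real code loads images with PIL or base64-encodes them per provider):

    import re

    def split_prompt_with_images(prompt: str) -> list:
        """Interleave text fragments and image paths referenced in the prompt."""
        image_paths: list[str] = []

        def _capture(match: re.Match) -> str:
            image_paths.append(match.group(1))
            return "IMAGE_INSERTED_HERE"

        text = re.sub(r"INSERT_IMAGE_HERE\(([^)]*)\)", _capture, prompt)
        fragments = text.split("IMAGE_INSERTED_HERE")

        content: list = []
        for i, fragment in enumerate(fragments):
            content.append(fragment)
            if i < len(image_paths):
                content.append(("image", image_paths[i]))
        return content

    # split_prompt_with_images("Model this INSERT_IMAGE_HERE(bracket.png) part")
    # -> ["Model this ", ("image", "bracket.png"), " part"]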
3 changes: 1 addition & 2 deletions partcad/src/partcad/ai_ollama.py
@@ -58,7 +58,6 @@ def generate_ollama(
self,
model: str,
prompt: str,
image_filenames: list[str] = [],
config: dict[str, Any] = {},
options_num: int = 1,
):
@@ -71,7 +70,7 @@
if not ollama_once():
return None

if len(image_filenames) > 0:
if "INSERT_IMAGE_HERE" in prompt:
raise NotImplementedError("Images are not supported by Ollama")

if "tokens" in config:
60 changes: 38 additions & 22 deletions partcad/src/partcad/ai_openai.py
@@ -11,6 +11,7 @@
import importlib
import mimetypes
from pathlib import Path
import re
import threading
from typing import Any

Expand All @@ -30,8 +31,8 @@
"gpt-3.5-turbo": 4096,
"gpt-4": 8000, # 32600,
"gpt-4-vision-preview": 8192,
"gpt-4o": 4096, # 32600,
"gpt-4o-mini": 8000, # 32600,
"gpt-4o": 16000, # 32600,
"gpt-4o-mini": 16000, # 32600,
}


@@ -61,7 +62,6 @@ def generate_openai(
self,
model: str,
prompt: str,
image_filenames: list[str] = [],
config: dict[str, Any] = {},
options_num: int = 1,
):
@@ -83,29 +83,45 @@
else:
temperature = None

content = [
    {"type": "text", "text": prompt},
    *list(
        map(
            lambda f: {
                "type": "image_url",
                "image_url": {
                    "url": "data:%s;base64,%s"
                    % (
                        mimetypes.guess_type(f, False)[0],
                        base64.b64encode(Path(f).read_bytes()).decode(),
                    ),
                    "detail": "high",
                },
            },
            image_filenames,
        )
    ),
]
pc_logging.debug("Prompt: %s", prompt)

image_content = []

def insert_image(match):
    filename = match.group(1)
    image_content.append(
        {
            "type": "image_url",
            "image_url": {
                "url": "data:%s;base64,%s"
                % (
                    mimetypes.guess_type(filename, False)[0],
                    base64.b64encode(
                        Path(filename).read_bytes()
                    ).decode(),
                ),
                "detail": "high",
            },
        }
    )
    return "IMAGE_INSERTED_HERE"

prompt = re.sub(r"INSERT_IMAGE_HERE\(([^)]*)\)", insert_image, prompt)
text_content = list(
map(
lambda prompt_section: {"type": "text", "text": prompt_section},
prompt.split("IMAGE_INSERTED_HERE"),
)
)

content = []
for i in range(len(text_content)):
content.append(text_content[i])
if i < len(image_content):
content.append(image_content[i])

cc = openai_client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a mechanical engineer"},
{"role": "user", "content": content},
],
stream=False,
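
After the interleaving loop, `content` is a flat list that alternates text items and base64-encoded image_url items, so the user message sent to the Chat Completions API ends up shaped roughly like this (the text and the truncated base64 payload are illustrative only):

    messages = [
        {"role": "system", "content": "You are a mechanical engineer"},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Write a script for the part shown here: "},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/png;base64,iVBORw0KGgoAAAANS...",
                        "detail": "high",
                    },
                },
                {"type": "text", "text": " Use millimeters."},
            ],
        },
    ]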