first commit

ivanbelenky · Jul 3, 2023 · aac2c01 · aac2c01
commit aac2c01
Show file tree

Hide file tree

Showing 8 changed files with 587 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
@@ -0,0 +1,34 @@
+## `jsonllm`
+
+```python
+from dateutil.parser import parse
+from dataclasses import dataclass
+
+import jsonllm
+from jsonllm import ToParse, parse_field
+
+jsonllm.config('openai', 'sk-123')
+#jsonllm.config('google', 'project_id', 'us-central1')
+
+@dataclass
+class Address:
+    street: ToParse=parse_field(types=str, needed=True,
+                                instructions='Find the street, if not found input 123 Main St')
+    city: ToParse=parse_field(types=str, needed=True,
+                              instructions='Find the city, if not found input New York')
+
+@dataclass
+class Person:
+    first_name: ToParse=parse_field(name='first_name', types=str, needed=True,
+                                    instructions='Find the first name, if not found input John')
+    last_name: ToParse=parse_field(name='last_name', types=str, needed=True,
+                                   instructions='Find the last name, if not found input Doe')
+    address: ToParse=parse_field(output_name='address', types=Address, needed=True)
+    age: ToParse=parse_field(output_name='age', types=(int, float), valid=lambda x: x > 0, default=0)
+    date_of_birth: ToParse=parse_field(output_name='dob', types=str, caster=lambda dob: parse(dob).date(),
+                                        instructions='Find the date of birth and cast it to isoformat')
+
+
+message = 'My name is John Connor, I think I was born 0 of Unix time.'
+response = jsonllm.loads(message, schema=Person, completion='openai')
+print(response)
+```
diff --git a/jsonllm/__init__.py b/jsonllm/__init__.py
@@ -0,0 +1,10 @@
+from .schema import (
+    SchemaError,
+    ToParse,
+    parse_field,
+)
+
+from .jsonllm import (
+    ParsedResponse,
+    loads,
+)
diff --git a/jsonllm/completions.py b/jsonllm/completions.py
@@ -0,0 +1,123 @@
+import os
+
+import openai
+import vertexai
+from vertexai.preview.language_models import ChatModel, TextGenerationModel
+
+from jsonllm.utils import no_tokens, OpenAIErrors
+from jsonllm.constants import MAX_TOKENS
+
+
+class Completion:
+    '''Completion strategies'''
+    class ClientError(Exception):
+        pass
+
+    class ServerError(Exception):
+        pass
+
+    @staticmethod
+    def _openai(prompt: str,
+                *,
+                temperature:float=0.0, 
+                top_p:float=1.0,
+                top_k:int=0,
+                model='gpt-3.5-turbo',
+                ):
+        #openai.api_key = os.environ.get('OPENAI_API_KEY')
+        try:
+            tokens_left = MAX_TOKENS[model] - Completion.no_tokens(prompt)
+            if tokens_left < 0:
+                raise Exception(
+                    f"Failed to complete prompt, not enough tokens left "
+                    f"try reducing prompt length: {tokens_left}")
+
+            if 'gpt-3.5-turbo' in model:
+                completion = openai.ChatCompletion.create(model=model, messages=[{'role':'user','content':prompt}], temperature=temperature)
+                raw_response = completion.choices[0].message.content
+            elif any([m in model for m in ['ada', 'babbage', 'curie', 'davinci']]):
+                completion = openai.Completion.create(model=model ,prompt=prompt, temperature=temperature)
+                raw_response = completion.choices[0].text
+            return raw_response
+        except OpenAIErrors as e:
+            raise Completion.ServerError(f"Failed to complete prompt: {e}")
+        except Exception as e:
+            raise Completion.ClientError(f"Failed to complete prompt: {e}")
+
+    @staticmethod
+    def _google(prompt: str, temperature: float = 0.0, model='chat-bison@001'):
+        '''Usable models (tested) are: 
+        - chat-bison@001
+        - †ext-bison@001
+        '''
+        try:
+            project_id = "trbs-dev" if os.environ.get('TRUCKBASE_SERVER') in ['DEVELOPMENT', 'LOCAL'] else "trbs-prod"
+            location = "us-central1"
+            parameters = {
+                    "temperature": temperature,
+                    "max_output_tokens": 1024,
+                    "top_p": 0.8,
+                    "top_k": 40,
+                }
+
+            vertexai.init(project=project_id, location=location)
+            if 'chat' in model:
+                chat_model = ChatModel.from_pretrained(model)
+                chat = chat_model.start_chat(examples=[])
+                response=chat.send_message(prompt, **parameters)
+                raw_response = response.text
+            elif 'text' in model:
+                text_model = TextGenerationModel.from_pretrained(model)
+                response = text_model.predict(prompt, **parameters)
+                raw_response = response.text
+
+            return raw_response
+        except Exception as e:
+            raise Completion.ClientError(f"Failed to complete prompt: {e}")
+
+    @staticmethod
+    def complete_prompt(prompt: str,
+                        llm: str,
+                        model: str,
+                        *,
+                        temperature: float = 0.0,
+                        ):
+        '''Completes prompt using specified strategy.
+
+        Parameters
+        ----------
+        prompt : str
+            prompt to complete
+        llm : str, optional
+            language model to use, by default 'openai', one of ['openai', 'llama', 'bard']
+        max_tokens : int, optional
+            max tokens to use for completion, by default MAX_TOKENS
+        temperature : float, optional
+            temperature defines the randomness of the predictions, the higher the more random, by default 0.0
+            particular care must be taken with different models since they establish different ranges for temperature
+            - openai: [0.0, 2.0]
+            - llama: NotImplemented
+            - bard: NotImplemented
+        model : str, optional
+            - openai: https://beta.openai.com/docs/api-reference/completions/create
+            - llama: NotImplemented
+            - bard: NotImplemented
+
+        Returns
+        -------
+        raw_response: str
+            raw response from completion
+
+        Raises
+        ------
+        ClientError
+            raised when completion fails on the client side, e.g. not enough tokens, wrong api key, etc.
+        ServerError
+            raised when completion fails on the side of the LLM provider
+        '''
+        if llm == 'openai':
+            return Completion.openai(prompt, temperature=temperature, model=model)
+        elif llm == 'google':
+            return Completion.google_completion(prompt, temperature=temperature, model=model)
+        else:
+            raise NotImplementedError(f"Completion model {llm} not implemented")
diff --git a/jsonllm/constants.py b/jsonllm/constants.py
@@ -0,0 +1,30 @@
+# Candidate character-replacement tables; index 0 is the empty (no-op) table.
+# NOTE(review): presumably applied to raw LLM output before JSON parsing — confirm against caller.
+REPLACEMENTS = [
+    {},
+    {
+    '\'': '"',
+    ' ': '',
+    '\n': ''
+}]
+
+# Default table: single quotes become double quotes; spaces and newlines are removed.
+DEFAULT_REPLACE = REPLACEMENTS[1]
+
+# Patterns, in order: a brace pair with arbitrarily nested braces (recursive),
+# a brace pair with no nested braces, and a bracket pair with no nested brackets.
+# NOTE(review): `(?R)` is recursive-pattern syntax that the stdlib `re` module rejects;
+# presumably these are compiled with the third-party `regex` package — confirm.
+REGEX_PATTERNS = [
+    r'{(?:[^{}]|(?R))*}',
+    r'{[^{}]*}',
+    r'\[[^\[\]]*\]',
+]
+
+
+# Token limits keyed by exact model name; completions.py looks the limit up by model
+# when checking the prompt against the budget.
+# NOTE(review): there is no 'davinci' entry although the OpenAI completion path accepts
+# davinci models — confirm whether one should be added.
+MAX_TOKENS = {
+    'chat-bison@001': -1, #TODO: check limits
+    'ada': 2048,
+    'babbage': 2048,
+    'curie': 2048,
+    'gpt-3.5-turbo': 4096,
+}
+
+
+# Prompt fragments for few-shot examples.
+# NOTE(review): `<examples>` looks like a placeholder substituted by the caller — confirm.
+EXAMPLES_PROMPTS = [
+    'Examples: <examples>',
+]
+