From 9c56d6f49dcd62a6c745ef688cc34cf1f81ed2ae Mon Sep 17 00:00:00 2001
From: MJ Rossetti
Date: Wed, 27 Dec 2023 14:50:05 -0500
Subject: [PATCH 1/5] Translate code from llama notebook

---
 README.md            | 16 +++++++++-
 app/llama_chain.py   | 72 ++++++++++++++++++++++++++++++++++++++++++++
 app/llama_llm.py     | 60 ++++++++++++++++++++++++++++++++++++
 app/llama_prompts.py | 38 +++++++++++++++++++++++
 requirements.txt     |  6 ++++
 5 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 app/llama_chain.py
 create mode 100644 app/llama_llm.py
 create mode 100644 app/llama_prompts.py

diff --git a/README.md b/README.md
index e7ea484..d146a84 100644
--- a/README.md
+++ b/README.md
@@ -42,10 +42,22 @@ Setup submission files:
 3. Move a copy of the starter notebook (which contains instructions and some starer code) into the submissions directory, and note the filename (i.e. `STARTER_FILENAME`).
 
-### OpenAI Setup
+### LLM Setup
+
+Choose an LLM provider (OpenAI or Meta Llama). OpenAI may be easier to get started with, but it costs money, whereas Meta Llama is free and is therefore the recommended provider. Follow the setup instructions below for whichever provider you choose.
+
+#### OpenAI Setup
 
 Obtain an OpenAI API Key (i.e. `OPENAI_API_KEY`).
 
+#### Llama Setup
+
+See: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+
+First, visit the [Meta Llama website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/), fill out the request form, and wait until your request is accepted.
+
+Then, create a [Hugging Face account](https://huggingface.co) (using the same email address you used for the Meta request), and obtain a [user access token](https://huggingface.co/docs/hub/security-tokens) (i.e. `HUGGINGFACE_TOKEN`).
+
 
 ### Environment Variables Setup
 
@@ -55,6 +67,8 @@ Create ".env" file and set environment variables:
 # this is the ".env" file...
 
 OPENAI_API_KEY="sk-..."
+# or:
+HUGGINGFACE_TOKEN="..."
 
 SUBMISSIONS_DIRPATH="/Users/USERNAME/Desktop/GRADING HW 4"
 STARTER_FILENAME="Homework_X_STARTER.ipynb"
diff --git a/app/llama_chain.py b/app/llama_chain.py
new file mode 100644
index 0000000..ab5b2fa
--- /dev/null
+++ b/app/llama_chain.py
@@ -0,0 +1,72 @@
+# adapted from youtube video about llama and langchain: ________________
+
+
+import os
+from dotenv import load_dotenv
+
+from langchain import HuggingFacePipeline
+from langchain import PromptTemplate, LLMChain
+
+from app.llama_prompts import get_prompt, parse_text
+from app.llama_llm import LlamaService
+
+
+load_dotenv()
+
+TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1}
+
+
+if __name__ == "__main__":
+
+    service = LlamaService()
+    pipeline = service.pipeline
+    llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={"temperature":TEMP})
+    print(llm)
+
+    # SIMPLE LLM CHAIN
+
+    system_prompt = "You are an advanced assistant that excels at translation. "
+    instruction = "Convert the following text from English to French:\n\n {text}"
+    template = get_prompt(instruction, system_prompt)
+    print(template)
+    prompt = PromptTemplate(template=template, input_variables=["text"])
+
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+    query = "how are you today?"
+    response = llm_chain.run(query)
+    parse_text(response)
+
+
+    # CHAT CHAIN
+
+    if input("Continue to chat (Y/N): ").upper() != "Y":
+        exit()
+
+
+    from langchain.memory import ConversationBufferMemory
+    from langchain import LLMChain, PromptTemplate
+
+    memory = ConversationBufferMemory(memory_key="chat_history")
+
+
+
+    # for chat, with memory
+    instruction = "Chat History:\n\n{chat_history} \n\nUser: {user_input}"
+    system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context"
+
+    template = get_prompt(instruction, system_prompt)
+    print(template)
+    prompt = PromptTemplate(template=template, input_variables=["chat_history", "user_input"])
+
+    llm_chain = LLMChain(prompt=prompt, llm=llm,
+                         verbose=True, memory=memory,
+                         )
+
+    query = None # start as None (not "") so the loop below runs until the user enters an empty string
+    while query != "":
+        query = input("Please ask a question: ")
+        print(query)
+
+        response = llm_chain.predict(user_input=query)
+        print(response)
diff --git a/app/llama_llm.py b/app/llama_llm.py
new file mode 100644
index 0000000..bc8c5d2
--- /dev/null
+++ b/app/llama_llm.py
@@ -0,0 +1,60 @@
+
+# adapted from youtube video about llama and langchain: ________________
+
+import os
+from dotenv import load_dotenv
+
+import torch
+import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+from app.llama_prompts import get_prompt, cut_off_text, remove_substring
+
+load_dotenv()
+
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+
+
+class LlamaService:
+    def __init__(self, model_name=MODEL_NAME, hf_token=HUGGINGFACE_TOKEN):
+        self.model_name = model_name
+        self.hf_token = hf_token
+
+    @property
+    def tokenizer(self):
+        # https://huggingface.co/transformers/v2.11.0/model_doc/auto.html?highlight=autotokenizer#autotokenizer
+        return AutoTokenizer.from_pretrained(self.model_name, token=self.hf_token)
+
+    @property
+    def model(self):
+        # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM
+        return AutoModelForCausalLM.from_pretrained(self.model_name, token=self.hf_token,
+                                                    device_map='auto', torch_dtype=torch.float16,
+                                                    )
+
+    @property
+    def pipeline(self):
+        # https://huggingface.co/docs/transformers/main_classes/pipelines
+        return pipeline(task="text-generation", model=self.model, tokenizer=self.tokenizer,
+                        device_map="auto", torch_dtype=torch.bfloat16,
+                        max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1,
+                        eos_token_id=self.tokenizer.eos_token_id
+                        )
+
+
+    def generate(self, text):
+        prompt = get_prompt(text)
+        with torch.autocast('cuda', dtype=torch.bfloat16):
+            inputs = self.tokenizer(prompt, return_tensors="pt").to('cuda')
+            outputs = self.model.generate(**inputs,
+                                          max_new_tokens=512,
+                                          eos_token_id=self.tokenizer.eos_token_id,
+                                          pad_token_id=self.tokenizer.eos_token_id,
+                                          )
+            final_outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+            final_outputs = cut_off_text(final_outputs, '</s>')
+            final_outputs = remove_substring(final_outputs, prompt)
+
+        return final_outputs #, outputs
diff --git a/app/llama_prompts.py b/app/llama_prompts.py
new file mode 100644
index 0000000..a45cf4c
--- /dev/null
+++ b/app/llama_prompts.py
@@ -0,0 +1,38 @@
+
+# adapted from youtube video about llama and langchain: ________________
+
+#import json
+import textwrap
+
+B_INST, E_INST = "[INST]", "[/INST]"
+
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" + +# TODO: refactor + +def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT): + SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS + prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST + return prompt_template + +def cut_off_text(text, prompt): + cutoff_phrase = prompt + index = text.find(cutoff_phrase) + if index != -1: + return text[:index] + else: + return text + +def remove_substring(string, substring): + return string.replace(substring, "") + + +def parse_text(text): + wrapped_text = textwrap.fill(text, width=100) + print(wrapped_text +'\n\n') + # return assistant_text diff --git a/requirements.txt b/requirements.txt index 9090990..0804b35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,12 @@ langchain # 0.0.348 tiktoken faiss-cpu +# llama: +torch # 2.1.0+cu121 +transformers # 4.35.2 +#accelerate # 0.25.0 +# torchtext # 0.16.0 + pytest From 5fa7c8ca1117642ff2dff6de37a03b67ac8e47a8 Mon Sep 17 00:00:00 2001 From: MJ Rossetti Date: Mon, 1 Jan 2024 12:13:50 -0500 Subject: [PATCH 2/5] Get organized, move all the things --- README.md | 12 ++++++++++-- ...document_formatting.py => document_decorators.py} | 0 app/{llama_chain.py => meta_llama/chain.py} | 0 app/{llama_llm.py => meta_llama/llm.py} | 0 app/{llama_prompts.py => meta_llama/prompts.py} | 0 app/{ => openai}/document_processor.py | 0 app/{openai_llm.py => openai/llm.py} | 0 app/{ => openai}/rows_processor.py | 0 app/{ => openai}/starter_doc_processor.py | 0 app/{ => openai}/submissions_grader.py | 0 app/{ => openai}/submissions_processor.py | 0 app/{ => openai}/submissions_retriever.py | 0 app/{response_formatters.py => response_models.py} | 0 requirements.txt | 2 +- 14 files changed, 11 insertions(+), 3 deletions(-) rename app/{document_formatting.py => document_decorators.py} (100%) rename app/{llama_chain.py => meta_llama/chain.py} (100%) rename app/{llama_llm.py => meta_llama/llm.py} (100%) rename app/{llama_prompts.py => meta_llama/prompts.py} (100%) rename app/{ => openai}/document_processor.py (100%) rename app/{openai_llm.py => openai/llm.py} (100%) rename app/{ => openai}/rows_processor.py (100%) rename app/{ => openai}/starter_doc_processor.py (100%) rename app/{ => openai}/submissions_grader.py (100%) rename app/{ => openai}/submissions_processor.py (100%) rename app/{ => openai}/submissions_retriever.py (100%) rename app/{response_formatters.py => response_models.py} (100%) diff --git a/README.md b/README.md index d146a84..e355424 100644 --- a/README.md +++ b/README.md @@ -66,10 +66,11 @@ Create ".env" file and set environment variables: ```sh # this is the ".env" file... +# choose one based on your preferred llm provider: OPENAI_API_KEY="sk-..." -# or: -HUGGINGFACE_TOKEN="..." +HUGGINGFACE_TOKEN="hf_..." 
+# for grading a particular homework: SUBMISSIONS_DIRPATH="/Users/USERNAME/Desktop/GRADING HW 4" STARTER_FILENAME="Homework_X_STARTER.ipynb" FILE_ID_SPLIT_INDEX="0" # 0 for files from Canvas, 1 for files from Blackboard @@ -121,6 +122,8 @@ DOCS_LIMIT=5 python -m app.submissions_retriever ### Retreival Augmented Generation (RAG) +#### OpenAI LLM + Chat with the LLM: ```sh @@ -133,6 +136,11 @@ DOCS_LIMIT=5 python -m app.submissions_grader # DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_grader ``` +#### Llama 2 LLM + +```sh +TEMP=0.6 python -m app.meta_llm +``` ## Testing diff --git a/app/document_formatting.py b/app/document_decorators.py similarity index 100% rename from app/document_formatting.py rename to app/document_decorators.py diff --git a/app/llama_chain.py b/app/meta_llama/chain.py similarity index 100% rename from app/llama_chain.py rename to app/meta_llama/chain.py diff --git a/app/llama_llm.py b/app/meta_llama/llm.py similarity index 100% rename from app/llama_llm.py rename to app/meta_llama/llm.py diff --git a/app/llama_prompts.py b/app/meta_llama/prompts.py similarity index 100% rename from app/llama_prompts.py rename to app/meta_llama/prompts.py diff --git a/app/document_processor.py b/app/openai/document_processor.py similarity index 100% rename from app/document_processor.py rename to app/openai/document_processor.py diff --git a/app/openai_llm.py b/app/openai/llm.py similarity index 100% rename from app/openai_llm.py rename to app/openai/llm.py diff --git a/app/rows_processor.py b/app/openai/rows_processor.py similarity index 100% rename from app/rows_processor.py rename to app/openai/rows_processor.py diff --git a/app/starter_doc_processor.py b/app/openai/starter_doc_processor.py similarity index 100% rename from app/starter_doc_processor.py rename to app/openai/starter_doc_processor.py diff --git a/app/submissions_grader.py b/app/openai/submissions_grader.py similarity index 100% rename from app/submissions_grader.py rename to app/openai/submissions_grader.py diff --git a/app/submissions_processor.py b/app/openai/submissions_processor.py similarity index 100% rename from app/submissions_processor.py rename to app/openai/submissions_processor.py diff --git a/app/submissions_retriever.py b/app/openai/submissions_retriever.py similarity index 100% rename from app/submissions_retriever.py rename to app/openai/submissions_retriever.py diff --git a/app/response_formatters.py b/app/response_models.py similarity index 100% rename from app/response_formatters.py rename to app/response_models.py diff --git a/requirements.txt b/requirements.txt index 0804b35..2d00a26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ tiktoken faiss-cpu # llama: -torch # 2.1.0+cu121 +torch # 2.1.0+cu121 (for colab) transformers # 4.35.2 #accelerate # 0.25.0 # torchtext # 0.16.0 From c151a10fd1f8f50d5a19624baa79d1f1a828aff7 Mon Sep 17 00:00:00 2001 From: MJ Rossetti Date: Mon, 1 Jan 2024 14:21:14 -0500 Subject: [PATCH 3/5] WIP - oh its too slow? need gpu? 
--- README.md | 34 +++--- app/{meta_llama => meta}/chain.py | 33 +++--- app/meta/llm.py | 163 ++++++++++++++++++++++++++++ app/{meta_llama => meta}/prompts.py | 0 app/meta_llama/llm.py | 60 ---------- requirements.txt | 2 +- 6 files changed, 199 insertions(+), 93 deletions(-) rename app/{meta_llama => meta}/chain.py (65%) create mode 100644 app/meta/llm.py rename app/{meta_llama => meta}/prompts.py (100%) delete mode 100644 app/meta_llama/llm.py diff --git a/README.md b/README.md index e355424..1dc0138 100644 --- a/README.md +++ b/README.md @@ -79,12 +79,31 @@ FILE_ID_SPLIT_INDEX="0" # 0 for files from Canvas, 1 for files from Blackboard ## Usage +### Submission Files Manager + Demonstrate ability to access submission files: ```sh python -m app.submissions_manager ``` +### LLM + +Demonstrate ability to query your LLM of choice (OpenAI or Meta Llama). + +Query the OpenAI LLM: + +```sh +TEMP=0.6 python -m app.openai.llm +``` + +Query the Meta Llama LLM: + +```sh +TEMP=0.6 python -m app.meta.llm +``` +> NOTE: the first time the LLama model is run, it will take a while to download. + ### Cell-based Document Splitting Process the starter file: @@ -122,25 +141,14 @@ DOCS_LIMIT=5 python -m app.submissions_retriever ### Retreival Augmented Generation (RAG) -#### OpenAI LLM - -Chat with the LLM: +Use an LLM for grading: ```sh -TEMP=0.6 python -m app.openai_llm -``` - -```sh -DOCS_LIMIT=5 python -m app.submissions_grader +DOCS_LIMIT=5 python -m app.openai.submissions_grader # DOCS_LIMIT=5 SIMILARITY_THRESHOLD=0.75 CHUNK_SIZE=1000 CHUNK_OVERLAP=0 python -m app.submissions_grader ``` -#### Llama 2 LLM - -```sh -TEMP=0.6 python -m app.meta_llm -``` ## Testing diff --git a/app/meta_llama/chain.py b/app/meta/chain.py similarity index 65% rename from app/meta_llama/chain.py rename to app/meta/chain.py index ab5b2fa..d7c73c2 100644 --- a/app/meta_llama/chain.py +++ b/app/meta/chain.py @@ -11,31 +11,26 @@ from app.llama_llm import LlamaService -load_dotenv() - -TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1} - - if __name__ == "__main__": service = LlamaService() - pipeline = service.pipeline - llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={"temperature":TEMP}) - print(llm) + #pipeline = service.pipeline + #llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={"temperature":TEMP}) + #print(llm) # SIMPLE LLM CHAIN - system_prompt = "You are an advanced assistant that excels at translation. " - instruction = "Convert the following text from English to French:\n\n {text}" - template = get_prompt(instruction, system_prompt) - print(template) - prompt = PromptTemplate(template=template, input_variables=["text"]) - - llm_chain = LLMChain(prompt=prompt, llm=llm) - - query = "how are you today?" - response = llm_chain.run(query) - parse_text(response) + #system_prompt = "You are an advanced assistant that excels at translation. " + #instruction = "Convert the following text from English to French:\n\n {text}" + #template = get_prompt(instruction, system_prompt) + #print(template) + #prompt = PromptTemplate(template=template, input_variables=["text"]) +# + #llm_chain = LLMChain(prompt=prompt, llm=llm) +# + #query = "how are you today?" 
+    #response = llm_chain.run(query)
+    #parse_text(response)
 
 
     # CHAT CHAIN
diff --git a/app/meta/llm.py b/app/meta/llm.py
new file mode 100644
index 0000000..82aec31
--- /dev/null
+++ b/app/meta/llm.py
@@ -0,0 +1,163 @@
+
+# adapted from youtube video about llama and langchain: ________________
+
+import os
+from dotenv import load_dotenv
+
+import torch
+#import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+
+from app.meta.prompts import get_prompt, parse_text, cut_off_text, remove_substring
+
+load_dotenv()
+
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf" # os.getenv("MODEL_NAME", default="meta-llama/Llama-2-7b-chat-hf")
+
+#MAX_NEW_TOKENS = 512
+TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1}
+
+# THIS IS THE OFFICIAL SYSTEM PROMPT?
+INST, INST_END = "[INST]", "[/INST]"
+SYS, SYS_END = "<<SYS>>\n", "\n<</SYS>>\n\n"
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+"""
+
+def compile_prompt(prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables=[]) -> PromptTemplate:
+    """Wraps your query in syntax the model understands. Uses default system instructions, or ones you provide.
+
+    Params:
+        prompt (str) : your prompt string, optionally with placeholder {} for input vars
+
+        input_variables : a list of string input variable names in your prompt, default is an empty list
+
+    Returns: langchain.PromptTemplate
+    """
+    formatted_prompt = f"{INST} {SYS} {system_prompt} {SYS_END} {prompt} {INST_END}"
+    return PromptTemplate(template=formatted_prompt, input_variables=input_variables)
+
+
+
+
+class HuggingFaceService:
+    def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN): # device_type="cpu",
+        self.model_name = model_name
+        self.token = token # hugging face api token
+        self.temp = temp
+
+        #self.device_type = device_type # "cpu" for local dev, or "cuda" for colab gpu
+
+    @property
+    def tokenizer(self):
+        # https://huggingface.co/transformers/v2.11.0/model_doc/auto.html?highlight=autotokenizer#autotokenizer
+        return AutoTokenizer.from_pretrained(self.model_name, token=self.token) # cache_dir=CACHE_DIRPATH
+
+    @property
+    def model(self):
+        # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM
+        return AutoModelForCausalLM.from_pretrained(self.model_name, token=self.token,
+            device_map="auto",
+            #torch_dtype=torch.float16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433
+            torch_dtype=torch.float32 # CPU
+        )
+
+    @property
+    def pipeline(self):
+        """wrapper for tokenizer and model, for performing the 'text-generation' task"""
+        # https://huggingface.co/docs/transformers/main_classes/pipelines
+        return pipeline(task="text-generation", model=self.model, tokenizer=self.tokenizer,
+            device_map="auto",
+            max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1,
+            eos_token_id=self.tokenizer.eos_token_id,
+            #torch_dtype=torch.bfloat16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433
+            torch_dtype=torch.float32, # CPU
+        )
+
+    @property
+    def llm(self):
+        return HuggingFacePipeline(pipeline=self.pipeline, model_kwargs={"temperature":self.temp})
+
+
+    #def predict(self, query):
+
+
+    #def formatted_response(self, prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables=None):
+    #    prompt = self.compile_prompt(prompt)
+    #
+    #    llm_chain = LLMChain(prompt=prompt, llm=llm)
+    #    response = llm_chain.run(query)
+    #    parse_text(response)
+
+    #def generate(self, text):
+    #    prompt = get_prompt(text)
+    #
+    #    with torch.autocast(self.device_type, dtype=torch.bfloat16):
+    #        #inputs = self.tokenizer(prompt, return_tensors="pt").to('cuda') # on CPU as well?
+    #        inputs = self.tokenizer(prompt, return_tensors="pt") #
+    #        breakpoint()
+    #        #if self.device_type == "cuda":
+    #        #    inputs = inputs.to("cuda")
+    #
+    #        outputs = self.model.generate(**inputs,
+    #            max_new_tokens=512,
+    #            eos_token_id=self.tokenizer.eos_token_id,
+    #            pad_token_id=self.tokenizer.eos_token_id,
+    #        )
+    #        final_outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+    #        final_outputs = cut_off_text(final_outputs, '</s>')
+    #        final_outputs = remove_substring(final_outputs, prompt)
+    #
+    #    return final_outputs #, outputs
+
+
+
+
+
+if __name__ == "__main__":
+
+    hf = HuggingFaceService()
+
+    llm = hf.llm
+    print(llm)
+
+    general_knowlege_queries = [
+        "What year was America founded?",
+        "Tell us about the first humans who landed on the moon."
+    ]
+
+    for query in general_knowlege_queries:
+        # response = llm.predict(query).strip()
+        prompt = compile_prompt(prompt=query)
+        llm_chain = LLMChain(prompt=prompt, llm=llm)
+        #response = llm_chain.run(query) # chain({'foo': 1, 'bar': 2})
+        #> ValueError: A single string input was passed in, but this chain expects multiple inputs (set()). When a chain expects multiple inputs, please call it by passing in a dictionary, eg `chain({'foo': 1, 'bar': 2})`
+        response = llm_chain({"query": query}) # ooh it's slow?
+        parse_text(response)
+
+
+    breakpoint()
+    exit()
+
+    # PROMPT
+
+    system_prompt = "You are an advanced assistant that excels at translation. "
+    instruction = "Convert the following text from English to French:\n\n {text}"
+    prompt = compile_prompt(prompt=instruction, system_prompt=system_prompt, input_variables=["text"])
+    print(prompt)
+
+    # CHAIN
+
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+
+    query = "how are you today?"
+ while query != "": + print(query) + response = llm_chain.run(query) + parse_text(response) + print("------") + query = input("Query (or press enter to stop): ") diff --git a/app/meta_llama/prompts.py b/app/meta/prompts.py similarity index 100% rename from app/meta_llama/prompts.py rename to app/meta/prompts.py diff --git a/app/meta_llama/llm.py b/app/meta_llama/llm.py deleted file mode 100644 index bc8c5d2..0000000 --- a/app/meta_llama/llm.py +++ /dev/null @@ -1,60 +0,0 @@ - -# adapted from youtube video about llama and langchain: ________________ - -import os -from dotenv import load_dotenv - -import torch -import transformers -from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline - -from app.llama_prompts import get_prompt, cut_off_text, remove_substring - -load_dotenv() - -HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") - -MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf" - - -class LlamaService: - def __init__(self, model_name=MODEL_NAME, hf_token=HUGGINGFACE_TOKEN): - self.model_name = model_name - self.hf_token = hf_token - - @property - def tokenizer(self): - # https://huggingface.co/transformers/v2.11.0/model_doc/auto.html?highlight=autotokenizer#autotokenizer - return AutoTokenizer.from_pretrained(self.model_name, token=self.hf_token) - - @property - def model(self): - # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM - return AutoModelForCausalLM.from_pretrained(self.model_name, token=self.hf_token, - device_map='auto', torch_dtype=torch.float16, - ) - - @property - def pipeline(self): - # https://huggingface.co/docs/transformers/main_classes/pipelines - return pipeline(task="text-generation", model=self.model, tokenizer= self.tokenizer, - device_map="auto", torch_dtype=torch.bfloat16, - max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1, - eos_token_id=self.tokenizer.eos_token_id - ) - - - def generate(self, text): - prompt = get_prompt(text) - with torch.autocast('cuda', dtype=torch.bfloat16): - inputs = self.tokenizer(prompt, return_tensors="pt").to('cuda') - outputs = self.model.generate(**inputs, - max_new_tokens=512, - eos_token_id=self.tokenizer.eos_token_id, - pad_token_id=self.tokenizer.eos_token_id, - ) - final_outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] - final_outputs = cut_off_text(final_outputs, '') - final_outputs = remove_substring(final_outputs, prompt) - - return final_outputs#, outputs diff --git a/requirements.txt b/requirements.txt index 2d00a26..736ab9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ faiss-cpu # llama: torch # 2.1.0+cu121 (for colab) transformers # 4.35.2 -#accelerate # 0.25.0 +accelerate # 0.25.0 # torchtext # 0.16.0 From 38683ebd880d5bbfc04429a2e8d2025aab434208 Mon Sep 17 00:00:00 2001 From: MJ Rossetti Date: Mon, 1 Jan 2024 14:51:39 -0500 Subject: [PATCH 4/5] Too slow on CPU --- app/meta/llm.py | 30 +++++++++++++++++++++++++----- requirements.txt | 2 +- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/app/meta/llm.py b/app/meta/llm.py index 82aec31..3fd940c 100644 --- a/app/meta/llm.py +++ b/app/meta/llm.py @@ -1,8 +1,13 @@ # adapted from youtube video about llama and langchain: ________________ +# this is so slow on CPU though... 
+# https://stackoverflow.com/a/77022488/670433 + + import os from dotenv import load_dotenv +import textwrap import torch #import transformers @@ -11,7 +16,7 @@ from langchain.chains import LLMChain from langchain.llms.huggingface_pipeline import HuggingFacePipeline -from app.meta.prompts import get_prompt, parse_text, cut_off_text, remove_substring +#from app.meta.prompts import get_prompt, parse_text, cut_off_text, remove_substring load_dotenv() @@ -44,13 +49,20 @@ def compile_prompt(prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables= return PromptTemplate(template=formatted_prompt, input_variables=input_variables) + + class HuggingFaceService: - def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN): # device_type="cpu", + def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN, device_type="cpu"): self.model_name = model_name self.token = token # hugging face api token self.temp = temp - #self.device_type = device_type # "cpu" for local dev, or "cuda" for colab gpu + self.device_type = device_type # "cpu" for local dev, or "cuda" for colab gpu + + # https://stackoverflow.com/a/73530618/670433 + # https://huggingface.co/openlm-research/open_llama_7b_v2/discussions/2 + # https://pytorch.org/docs/stable/tensors.html + self.torch_dtype = torch.float32 if self.device_type == "cpu" else torch.float16 @property def tokenizer(self): @@ -63,7 +75,8 @@ def model(self): return AutoModelForCausalLM.from_pretrained(self.model_name, token=self.token, device_map="auto", #torch_dtype=torch.float16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433 - torch_dtype=torch.float32 # CPU + #torch_dtype=torch.float32 # CPU + torch_dtype=self.torch_dtype ) @property @@ -75,7 +88,8 @@ def pipeline(self): max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1, eos_token_id=self.tokenizer.eos_token_id, #torch_dtype=torch.bfloat16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433 - torch_dtype=torch.float32, # CPU + #torch_dtype=torch.float32, # CPU + torch_dtype=self.torch_dtype ) @property @@ -116,6 +130,12 @@ def llm(self): +def parse_text(text): + wrapped_text = textwrap.fill(text, width=100) + print(wrapped_text +'\n\n') + # return assistant_text + + if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 736ab9f..51bb70b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ plotly openai # 1.3.8 -langchain # 0.0.348 +langchain # 0.0.348 ... 0.0.353 tiktoken faiss-cpu From fdbc57420a2b3bfeeee1d89cea4455ccd1579f3c Mon Sep 17 00:00:00 2001 From: MJ Rossetti Date: Sun, 14 Jan 2024 15:24:45 -0500 Subject: [PATCH 5/5] Auto detect gpu --- app/meta/llm.py | 54 ++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/app/meta/llm.py b/app/meta/llm.py index 3fd940c..763876c 100644 --- a/app/meta/llm.py +++ b/app/meta/llm.py @@ -8,6 +8,7 @@ import os from dotenv import load_dotenv import textwrap +from random import choice import torch #import transformers @@ -26,6 +27,9 @@ #MAX_NEW_TOKENS = 512 TEMP = float(os.getenv("TEMP", default="0.0")) # @param {type:"slider", min:0, max:1, step:0.1} + + + # THIS IS THE OFFICIAL SYSTEM PROMPT? 
INST, INST_END = "[INST]", "[/INST]" SYS, SYS_END = "<>\n", "\n<>\n\n" @@ -52,17 +56,16 @@ def compile_prompt(prompt, system_prompt=DEFAULT_SYSTEM_PROMPT, input_variables= class HuggingFaceService: - def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN, device_type="cpu"): + def __init__(self, model_name=MODEL_NAME, temp=TEMP, token=HUGGINGFACE_TOKEN): self.model_name = model_name self.token = token # hugging face api token self.temp = temp - self.device_type = device_type # "cpu" for local dev, or "cuda" for colab gpu - + self.device_type = "cuda" if torch.cuda.is_available() else "cpu" # https://stackoverflow.com/a/73530618/670433 # https://huggingface.co/openlm-research/open_llama_7b_v2/discussions/2 # https://pytorch.org/docs/stable/tensors.html - self.torch_dtype = torch.float32 if self.device_type == "cpu" else torch.float16 + self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 @property def tokenizer(self): @@ -72,29 +75,27 @@ def tokenizer(self): @property def model(self): # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM - return AutoModelForCausalLM.from_pretrained(self.model_name, token=self.token, - device_map="auto", - #torch_dtype=torch.float16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433 - #torch_dtype=torch.float32 # CPU - torch_dtype=self.torch_dtype + return AutoModelForCausalLM.from_pretrained( + self.model_name, token=self.token, device_map="auto", torch_dtype=self.torch_dtype ) @property def pipeline(self): """wrapper for tokenizer and model, for performing the 'text-generation' task""" # https://huggingface.co/docs/transformers/main_classes/pipelines - return pipeline(task="text-generation", model=self.model, tokenizer=self.tokenizer, - device_map="auto", - max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1, - eos_token_id=self.tokenizer.eos_token_id, - #torch_dtype=torch.bfloat16, # GPU ONLY? https://stackoverflow.com/a/73530618/670433 - #torch_dtype=torch.float32, # CPU - torch_dtype=self.torch_dtype + return pipeline( + task="text-generation", model=self.model, tokenizer=self.tokenizer, + device_map="auto", torch_dtype=self.torch_dtype, # torch.bfloat16 + max_new_tokens=512, do_sample=True, top_k=30, num_return_sequences=1, + eos_token_id=self.tokenizer.eos_token_id, ) @property def llm(self): - return HuggingFacePipeline(pipeline=self.pipeline, model_kwargs={"temperature":self.temp}) + return HuggingFacePipeline( + #model_id=self.model_name, # this one is getting set to "gpt2" by default? + pipeline=self.pipeline, model_kwargs={"temperature":self.temp} + ) #def predict(self, query): @@ -150,14 +151,17 @@ def parse_text(text): "Tell us about the first humans who landed on the moon." ] - for query in general_knowlege_queries: - # response = llm.predict(query).strip() - prompt = compile_prompt(prompt=query) - llm_chain = LLMChain(prompt=prompt, llm=llm) - #response = llm_chain.run(query) # chain({'foo': 1, 'bar': 2}) - #> ValueError: A single string input was passed in, but this chain expects multiple inputs (set()). When a chain expects multiple inputs, please call it by passing in a dictionary, eg `chain({'foo': 1, 'bar': 2})` - response = llm_chain({"query": query}) # ooh it's slow? 
-        parse_text(response)
+    query = input("Please provide a Query (or press enter): ")
+    query = query or choice(general_knowlege_queries)
+    print(query)
+
+    # response = llm.predict(query).strip()
+    prompt = compile_prompt(prompt=query)
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+    #response = llm_chain.run(query) # chain({'foo': 1, 'bar': 2})
+    #> ValueError: A single string input was passed in, but this chain expects multiple inputs (set()). When a chain expects multiple inputs, please call it by passing in a dictionary, eg `chain({'foo': 1, 'bar': 2})`
+    response = llm_chain({"query": query}) # ooh it's slow?
+    parse_text(response)
 
 
     breakpoint()