From a3b43017dce37be0072405f9900aeb1f3191e53e Mon Sep 17 00:00:00 2001 From: daboncanplay Date: Wed, 12 Jul 2023 18:16:02 +0000 Subject: [PATCH] publishing v1.0.3 --- Includes/Classroom-Setup.py | 1 + Includes/Test-Framework.py | 4 ++-- Includes/Workspace-Setup.py | 15 --------------- Includes/_common.py | 2 +- .../LLM 01L - LLMs with Hugging Face Lab.py | 14 +++++--------- ... - Embeddings, Vector Databases, and Search.py | 12 +++++------- ... - Embeddings, Vector Databases, and Search.py | 11 ++++++----- .../LLM 02a - Pinecone [OPTIONAL].py | 9 ++++----- .../LLM 03 - Building LLM Chains.py | 6 +++--- .../LLM 03L - Building LLM Chains Lab.py | 1 + .../LLM 04b - Evaluating LLMs.py | 4 ++-- Version Info.py | 4 ++-- 12 files changed, 32 insertions(+), 51 deletions(-) diff --git a/Includes/Classroom-Setup.py b/Includes/Classroom-Setup.py index 430ae91..7e69d1a 100644 --- a/Includes/Classroom-Setup.py +++ b/Includes/Classroom-Setup.py @@ -9,6 +9,7 @@ DA.paths.working_dir = DA.paths.to_vm_path(DA.paths.working_dir) DA.paths.datasets = DA.paths.to_vm_path(DA.paths.datasets) +DA.paths.user_db = DA.paths.to_vm_path(DA.paths.user_db) # COMMAND ---------- diff --git a/Includes/Test-Framework.py b/Includes/Test-Framework.py index f8cf18c..13e9872 100644 --- a/Includes/Test-Framework.py +++ b/Includes/Test-Framework.py @@ -73,7 +73,7 @@ def dbTestQuestion1_2(translation_pipeline, translation_results, translation_inp lesson, question = "lesson1", "question2" userhome_for_testing = getUsernameFromEnv(lesson) - assert str(translation_pipeline.task) == "translation", "Test NOT passed: Pipeline should be built for task `translation`" + assert "translation" in str(translation_pipeline.task), "Test NOT passed: Pipeline should be built for task `translation`" assert isinstance(translation_results, list), "Test NOT passed: Result should be a list." assert len(translation_results) == len(translation_inputs), "Test NOT passed: Result should be a list of length equal to the input dataset size." assert min([len(s) for s in translation_results]) > 0, "Test NOT passed: Translations should be non-empty." @@ -100,7 +100,7 @@ def dbTestQuestion2_1(collection_name): lesson, question = "lesson2", "question1" userhome_for_testing = getUsernameFromEnv(lesson) - assert collection_name is not None, "Test NOT passed: The collection name should not be empty." + assert collection_name=="my_talks", "Test NOT passed: The collection_name should be my_talks." questionPassed(userhome_for_testing, lesson, question) diff --git a/Includes/Workspace-Setup.py b/Includes/Workspace-Setup.py index e28d89a..1dc712a 100644 --- a/Includes/Workspace-Setup.py +++ b/Includes/Workspace-Setup.py @@ -130,21 +130,6 @@ # COMMAND ---------- -# MAGIC %md -# MAGIC -# MAGIC ## Create Class-Shared Databricks SQL Warehouse/Endpoint -# MAGIC Creates a single wharehouse to be used by all students. -# MAGIC -# MAGIC The configuration is derived from the number of students specified above. - -# COMMAND ---------- - -from dbacademy.dbhelper.warehouses_helper_class import WarehousesHelper - -DA.workspace.warehouses.create_shared_sql_warehouse(name=WarehousesHelper.WAREHOUSES_DEFAULT_NAME) - -# COMMAND ---------- - # MAGIC %md # MAGIC # MAGIC ## Configure User Entitlements diff --git a/Includes/_common.py b/Includes/_common.py index 5243d7d..3d5a0dc 100644 --- a/Includes/_common.py +++ b/Includes/_common.py @@ -21,7 +21,7 @@ def __install_libraries(): if specified_version != version: print("** Dependency Version Overridden *******************************************************************") print(f"* This course was built for {specified_version} of the DBAcademy Library, but it is being overridden via the Spark") - print(f"* configuration variable \"{key}\". The use of version v3.0.82 is not advised as we") + print(f"* configuration variable \"{key}\". The use of version v3.0.84 is not advised as we") print(f"* cannot guarantee compatibility with this version of the course.") print("****************************************************************************************************") diff --git a/LLM 01 - Applications with LLMs/LLM 01L - LLMs with Hugging Face Lab.py b/LLM 01 - Applications with LLMs/LLM 01L - LLMs with Hugging Face Lab.py index f514fe3..580d12f 100644 --- a/LLM 01 - Applications with LLMs/LLM 01L - LLMs with Hugging Face Lab.py +++ b/LLM 01 - Applications with LLMs/LLM 01L - LLMs with Hugging Face Lab.py @@ -108,7 +108,10 @@ # MAGIC # MAGIC We will use the [Helsinki-NLP/tatoeba_mt](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt) dataset. It includes sentence pairs from many languages, but we will focus on translating Japanese to English. # MAGIC -# MAGIC If you feel stuck, please refer to the hints at the end of this section. +# MAGIC Hints in case you feel stuck on this task: +# MAGIC * Some models can handle *a lot* of languages. Check out [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb), the No Language Left Behind model ([research paper](https://arxiv.org/abs/2207.04672)). +# MAGIC * The "translation" task for `pipeline` takes optional parameters `src_lang` (source language) and `tgt_lang` (target language), which are important when the model can handle multiple languages. To figure out what codes to use to specify languages (and scripts for those languages), it can be helpful to find existing examples of using your model; for NLLB, check out [this Python script with codes](https://huggingface.co/spaces/Geonmo/nllb-translation-demo/blob/main/flores200_codes.py) or similar demo resources. +# MAGIC # COMMAND ---------- @@ -159,13 +162,6 @@ # COMMAND ---------- -# MAGIC %md If you feel stuck on the above Japanese -> English translation task, here are some hints: -# MAGIC * Some models can handle *a lot* of languages. Check out [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb), the No Language Left Behind model ([research paper](https://arxiv.org/abs/2207.04672)). -# MAGIC * The "translation" task for `pipeline` takes optional parameters `src_lang` (source language) and `tgt_lang` (target language), which are important when the model can handle multiple languages. To figure out what codes to use to specify languages (and scripts for those languages), it can be helpful to find existing examples of using your model; for NLLB, check out [this Python script with codes](https://huggingface.co/spaces/Geonmo/nllb-translation-demo/blob/main/flores200_codes.py) or similar demo resources. -# MAGIC - -# COMMAND ---------- - # MAGIC %md ### Question 3: Few-shot learning # MAGIC # MAGIC In this section, you will build a prompt which gets an LLM to answer a few-shot learning problem. Your prompt will have 3 sections: @@ -176,7 +172,7 @@ # MAGIC # MAGIC Your goal is to make the LLM answer the new query, with as good a response as possible. # MAGIC -# MAGIC More specifically, your prompt should following this template: +# MAGIC More specifically, your prompt should follow this template: # MAGIC ``` # MAGIC : # MAGIC diff --git a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02 - Embeddings, Vector Databases, and Search.py b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02 - Embeddings, Vector Databases, and Search.py index 97d2e24..6ceff6e 100644 --- a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02 - Embeddings, Vector Databases, and Search.py +++ b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02 - Embeddings, Vector Databases, and Search.py @@ -204,14 +204,12 @@ def search_content(query, pdf_to_index, k=3): collection_name = "my_news" -# If you have created the collection before, you need delete the collection first -if len(chroma_client.list_collections()) > 0 and collection_name in [ - chroma_client.list_collections()[0].name -]: +# If you have created the collection before, you need to delete the collection first +if len(chroma_client.list_collections()) > 0 and collection_name in [chroma_client.list_collections()[0].name]: chroma_client.delete_collection(name=collection_name) -else: - print(f"Creating collection: '{collection_name}'") - collection = chroma_client.create_collection(name=collection_name) + +print(f"Creating collection: '{collection_name}'") +collection = chroma_client.create_collection(name=collection_name) # COMMAND ---------- diff --git a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02L - Embeddings, Vector Databases, and Search.py b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02L - Embeddings, Vector Databases, and Search.py index 821611a..0ca4bc0 100644 --- a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02L - Embeddings, Vector Databases, and Search.py +++ b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02L - Embeddings, Vector Databases, and Search.py @@ -79,7 +79,7 @@ # MAGIC %md # MAGIC -# MAGIC Fill out `collection_name` below. +# MAGIC Assign the value of `my_talks` to the `collection_name` variable. # COMMAND ---------- @@ -104,7 +104,7 @@ # MAGIC %md # MAGIC ## Question 2 # MAGIC -# MAGIC Add data to collection +# MAGIC [Add](https://docs.trychroma.com/reference/Collection#add) data to the collection. # COMMAND ---------- @@ -125,7 +125,7 @@ # MAGIC %md # MAGIC ## Question 3 # MAGIC -# MAGIC Query for relevant documents +# MAGIC [Query](https://docs.trychroma.com/reference/Collection#query) for relevant documents. If you are looking for talks related to language models, your query texts could be `language models`. # COMMAND ---------- @@ -150,7 +150,7 @@ # MAGIC %md # MAGIC ## Question 4 # MAGIC -# MAGIC Load language model +# MAGIC Load a language model and create a [pipeline](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines). # COMMAND ---------- @@ -183,7 +183,8 @@ # TODO # Come up with a question that you need the LLM assistant to help you with -# A sample question is "Help me find sessions related to XYZ" +# A sample question is "Help me find sessions related to XYZ" +# Note: Your "XYZ" should be related to the query you passed in Question 3. question = "" # Provide all returned similar documents from the cell above below diff --git a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02a - Pinecone [OPTIONAL].py b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02a - Pinecone [OPTIONAL].py index 00e3102..4feb0c5 100644 --- a/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02a - Pinecone [OPTIONAL].py +++ b/LLM 02 - Embeddings, Vector Databases, and Search/LLM 02a - Pinecone [OPTIONAL].py @@ -27,7 +27,7 @@ # COMMAND ---------- -# MAGIC %pip install pinecone-client==2.2.1 +# MAGIC %pip install pinecone-client==2.2.2 # COMMAND ---------- @@ -236,8 +236,7 @@ @pandas_udf("array") def create_embeddings_with_transformers( - sentences: Iterator[pd.Series], -) -> Iterator[pd.Series]: + sentences: Iterator[pd.Series],) -> Iterator[pd.Series]: model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") for batch in sentences: yield pd.Series(model.encode(batch).tolist()) @@ -249,11 +248,11 @@ def create_embeddings_with_transformers( transformer_type = "sentence-transformers/all-MiniLM-L6-v2" embedding_spark_df = ( df.limit(1000) - .withColumn("vector", create_embeddings_with_transformers("title")) + .withColumn("values", create_embeddings_with_transformers("title")) .withColumn("namespace", F.lit(transformer_type)) .withColumn("metadata", F.to_json(F.struct(F.col("topic").alias("TOPIC")))) # We select these columns because they are expected by the Spark-Pinecone connector - .select("id", "vector", "namespace", "metadata") + .select("id", "values", "namespace", "metadata") ) display(embedding_spark_df) diff --git a/LLM 03 - Multi-stage Reasoning/LLM 03 - Building LLM Chains.py b/LLM 03 - Multi-stage Reasoning/LLM 03 - Building LLM Chains.py index a72ac65..95d0d7b 100644 --- a/LLM 03 - Multi-stage Reasoning/LLM 03 - Building LLM Chains.py +++ b/LLM 03 - Multi-stage Reasoning/LLM 03 - Building LLM Chains.py @@ -230,9 +230,9 @@ # ----------------------------------- # 2 We connect an LLM for Hyde, (we could use a slightly more advanced model 'text-davinci-003 since we have some more logic in this prompt). -# hyde_llm=jekyll_llm +hyde_llm = jekyll_llm # Uncomment the line below if you were to use OpenAI instead -hyde_llm = OpenAI(model="text-davinci-003") +# hyde_llm = OpenAI(model="text-davinci-003") # ----------------------------------- # ----------------------------------- @@ -273,7 +273,7 @@ # MAGIC %md # MAGIC ## `DaScie` - Our first vector database data science AI agent! # MAGIC -# MAGIC In this section we're going to build an Agent based on the [ReAct paradigm](https://react-lm.github.io/) (or though-action-observation loop) that will take instructions in plain text and perform data science analysis on data that we've stored in a vector database. The agent type we'll use is using zero-shot learning, which takes in the prompt and leverages the underlying LLMs' zero-shot abilities. +# MAGIC In this section we're going to build an Agent based on the [ReAct paradigm](https://react-lm.github.io/) (or thought-action-observation loop) that will take instructions in plain text and perform data science analysis on data that we've stored in a vector database. The agent type we'll use is using zero-shot learning, which takes in the prompt and leverages the underlying LLMs' zero-shot abilities. # COMMAND ---------- diff --git a/LLM 03 - Multi-stage Reasoning/LLM 03L - Building LLM Chains Lab.py b/LLM 03 - Multi-stage Reasoning/LLM 03L - Building LLM Chains Lab.py index 139ef52..37aa040 100644 --- a/LLM 03 - Multi-stage Reasoning/LLM 03L - Building LLM Chains Lab.py +++ b/LLM 03 - Multi-stage Reasoning/LLM 03L - Building LLM Chains Lab.py @@ -246,6 +246,7 @@ # MAGIC - [`map_reduce`](https://docs.langchain.com/docs/components/chains/index_related_chains#map-reduce) - This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary of that chunk; for question-answering tasks, it could be an answer based solely on that chunk). # MAGIC - [`refine`](https://docs.langchain.com/docs/components/chains/index_related_chains#refine) - This method involves running an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document. # MAGIC - [`map_rerank`](https://docs.langchain.com/docs/components/chains/index_related_chains#map-rerank) - This method involves running an initial prompt on each chunk of data, that not only tries to complete a task but also gives a score for how certain it is in its answer. The responses are then ranked according to this score, and the highest score is returned. +# MAGIC * NOTE: For this exercise, `map_rerank` will [error](https://github.com/hwchase17/langchain/issues/3970). # COMMAND ---------- diff --git a/LLM 04 - Fine-tuning and Evaluating LLMs/LLM 04b - Evaluating LLMs.py b/LLM 04 - Fine-tuning and Evaluating LLMs/LLM 04b - Evaluating LLMs.py index fb5aa64..5d579c9 100644 --- a/LLM 04 - Fine-tuning and Evaluating LLMs/LLM 04b - Evaluating LLMs.py +++ b/LLM 04 - Fine-tuning and Evaluating LLMs/LLM 04b - Evaluating LLMs.py @@ -43,7 +43,7 @@ # MAGIC %md ## Dataset # MAGIC -# MAGIC We will use a subset of the `cnn_dailymail` dataset from See et al., 2017, downloadable from the Hugging Face `datasets` hub: https://huggingface.co/datasets/cnn_dailymail +# MAGIC We will use a subset of the `cnn_dailymail` dataset from See et al., 2017, downloadable from the [Hugging Face `datasets` hub](https://huggingface.co/datasets/cnn_dailymail). # MAGIC # MAGIC This dataset provides news article paired with summaries (in the "highlights" column). Let's load the data and take a look at some examples. @@ -202,7 +202,7 @@ def perform_inference(batch: list) -> list: # MAGIC # MAGIC Now that we can generate summaries---and we know 0/1 accuracy is useless here---let's look at how we can compute a meaningful metric designed to evaluate summarization: ROUGE. # MAGIC -# MAGIC Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is a set of evaluation metrics designed for comparing summaries from Lin et al., 2004. See https://en.wikipedia.org/wiki/ROUGE_(metric) for more info. Here, we use the Hugging Face Evaluator wrapper to call into the `rouge_score` package. This package provides 4 scores: +# MAGIC Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is a set of evaluation metrics designed for comparing summaries from Lin et al., 2004. See [Wikipedia](https://en.wikipedia.org/wiki/ROUGE_(metric)) for more info. Here, we use the Hugging Face Evaluator wrapper to call into the `rouge_score` package. This package provides 4 scores: # MAGIC # MAGIC * `rouge1`: ROUGE computed over unigrams (single words or tokens) # MAGIC * `rouge2`: ROUGE computed over bigrams (pairs of consecutive words or tokens) diff --git a/Version Info.py b/Version Info.py index 0c281bb..06f2a5d 100644 --- a/Version Info.py +++ b/Version Info.py @@ -11,8 +11,8 @@ # MAGIC # Project Information # MAGIC # MAGIC * Name: **Large Language Models** -# MAGIC * Version: **1.0.2** -# MAGIC * Built On: **Jun 8, 2023 at 00:52:11 UTC** +# MAGIC * Version: **1.0.3** +# MAGIC * Built On: **Jul 12, 2023 at 18:13:09 UTC** # COMMAND ----------