Skip to content

Commit

Permalink
publishing v1.0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
daboncanplay committed Jul 12, 2023
1 parent 1c27aa2 commit a3b4301
Show file tree
Hide file tree
Showing 12 changed files with 32 additions and 51 deletions.
1 change: 1 addition & 0 deletions Includes/Classroom-Setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

DA.paths.working_dir = DA.paths.to_vm_path(DA.paths.working_dir)
DA.paths.datasets = DA.paths.to_vm_path(DA.paths.datasets)
DA.paths.user_db = DA.paths.to_vm_path(DA.paths.user_db)

# COMMAND ----------

Expand Down
4 changes: 2 additions & 2 deletions Includes/Test-Framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def dbTestQuestion1_2(translation_pipeline, translation_results, translation_inp
lesson, question = "lesson1", "question2"
userhome_for_testing = getUsernameFromEnv(lesson)

assert str(translation_pipeline.task) == "translation", "Test NOT passed: Pipeline should be built for task `translation`"
assert "translation" in str(translation_pipeline.task), "Test NOT passed: Pipeline should be built for task `translation`"
assert isinstance(translation_results, list), "Test NOT passed: Result should be a list."
assert len(translation_results) == len(translation_inputs), "Test NOT passed: Result should be a list of length equal to the input dataset size."
assert min([len(s) for s in translation_results]) > 0, "Test NOT passed: Translations should be non-empty."
Expand All @@ -100,7 +100,7 @@ def dbTestQuestion2_1(collection_name):
lesson, question = "lesson2", "question1"
userhome_for_testing = getUsernameFromEnv(lesson)

assert collection_name is not None, "Test NOT passed: The collection name should not be empty."
assert collection_name=="my_talks", "Test NOT passed: The collection_name should be my_talks."

questionPassed(userhome_for_testing, lesson, question)

Expand Down
15 changes: 0 additions & 15 deletions Includes/Workspace-Setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,21 +130,6 @@

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Create Class-Shared Databricks SQL Warehouse/Endpoint
# MAGIC Creates a single warehouse to be used by all students.
# MAGIC
# MAGIC The configuration is derived from the number of students specified above.

# COMMAND ----------

from dbacademy.dbhelper.warehouses_helper_class import WarehousesHelper

DA.workspace.warehouses.create_shared_sql_warehouse(name=WarehousesHelper.WAREHOUSES_DEFAULT_NAME)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Configure User Entitlements
Expand Down
2 changes: 1 addition & 1 deletion Includes/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __install_libraries():
if specified_version != version:
print("** Dependency Version Overridden *******************************************************************")
print(f"* This course was built for {specified_version} of the DBAcademy Library, but it is being overridden via the Spark")
print(f"* configuration variable \"{key}\". The use of version v3.0.82 is not advised as we")
print(f"* configuration variable \"{key}\". The use of version v3.0.84 is not advised as we")
print(f"* cannot guarantee compatibility with this version of the course.")
print("****************************************************************************************************")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,10 @@
# MAGIC
# MAGIC We will use the [Helsinki-NLP/tatoeba_mt](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt) dataset. It includes sentence pairs from many languages, but we will focus on translating Japanese to English.
# MAGIC
# MAGIC If you feel stuck, please refer to the hints at the end of this section.
# MAGIC Hints in case you feel stuck on this task:
# MAGIC * Some models can handle *a lot* of languages. Check out [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb), the No Language Left Behind model ([research paper](https://arxiv.org/abs/2207.04672)).
# MAGIC * The "translation" task for `pipeline` takes optional parameters `src_lang` (source language) and `tgt_lang` (target language), which are important when the model can handle multiple languages. To figure out what codes to use to specify languages (and scripts for those languages), it can be helpful to find existing examples of using your model; for NLLB, check out [this Python script with codes](https://huggingface.co/spaces/Geonmo/nllb-translation-demo/blob/main/flores200_codes.py) or similar demo resources.
# MAGIC

# COMMAND ----------

Expand Down Expand Up @@ -159,13 +162,6 @@

# COMMAND ----------

# MAGIC %md If you feel stuck on the above Japanese -> English translation task, here are some hints:
# MAGIC * Some models can handle *a lot* of languages. Check out [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb), the No Language Left Behind model ([research paper](https://arxiv.org/abs/2207.04672)).
# MAGIC * The "translation" task for `pipeline` takes optional parameters `src_lang` (source language) and `tgt_lang` (target language), which are important when the model can handle multiple languages. To figure out what codes to use to specify languages (and scripts for those languages), it can be helpful to find existing examples of using your model; for NLLB, check out [this Python script with codes](https://huggingface.co/spaces/Geonmo/nllb-translation-demo/blob/main/flores200_codes.py) or similar demo resources.
# MAGIC

# COMMAND ----------

# MAGIC %md ### Question 3: Few-shot learning
# MAGIC
# MAGIC In this section, you will build a prompt which gets an LLM to answer a few-shot learning problem. Your prompt will have 3 sections:
Expand All @@ -176,7 +172,7 @@
# MAGIC
# MAGIC Your goal is to make the LLM answer the new query, with as good a response as possible.
# MAGIC
# MAGIC More specifically, your prompt should following this template:
# MAGIC More specifically, your prompt should follow this template:
# MAGIC ```
# MAGIC <High-level instruction about the task: Given input_label, generate output_label.>:
# MAGIC
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,12 @@ def search_content(query, pdf_to_index, k=3):

collection_name = "my_news"

# If you have created the collection before, you need delete the collection first
if len(chroma_client.list_collections()) > 0 and collection_name in [
chroma_client.list_collections()[0].name
]:
# If you have created the collection before, you need to delete the collection first
if len(chroma_client.list_collections()) > 0 and collection_name in [chroma_client.list_collections()[0].name]:
chroma_client.delete_collection(name=collection_name)
else:
print(f"Creating collection: '{collection_name}'")
collection = chroma_client.create_collection(name=collection_name)

print(f"Creating collection: '{collection_name}'")
collection = chroma_client.create_collection(name=collection_name)

# COMMAND ----------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

# MAGIC %md
# MAGIC
# MAGIC Fill out `collection_name` below.
# MAGIC Assign the value of `my_talks` to the `collection_name` variable.

# COMMAND ----------

Expand All @@ -104,7 +104,7 @@
# MAGIC %md
# MAGIC ## Question 2
# MAGIC
# MAGIC Add data to collection
# MAGIC [Add](https://docs.trychroma.com/reference/Collection#add) data to the collection.

# COMMAND ----------

Expand All @@ -125,7 +125,7 @@
# MAGIC %md
# MAGIC ## Question 3
# MAGIC
# MAGIC Query for relevant documents
# MAGIC [Query](https://docs.trychroma.com/reference/Collection#query) for relevant documents. If you are looking for talks related to language models, your query texts could be `language models`.

# COMMAND ----------

Expand All @@ -150,7 +150,7 @@
# MAGIC %md
# MAGIC ## Question 4
# MAGIC
# MAGIC Load language model
# MAGIC Load a language model and create a [pipeline](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines).

# COMMAND ----------

Expand Down Expand Up @@ -183,7 +183,8 @@

# TODO
# Come up with a question that you need the LLM assistant to help you with
# A sample question is "Help me find sessions related to XYZ"
# A sample question is "Help me find sessions related to XYZ"
# Note: Your "XYZ" should be related to the query you passed in Question 3.
question = "<FILL_IN>"

# Provide all returned similar documents from the cell above below
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

# COMMAND ----------

# MAGIC %pip install pinecone-client==2.2.1
# MAGIC %pip install pinecone-client==2.2.2

# COMMAND ----------

Expand Down Expand Up @@ -236,8 +236,7 @@

@pandas_udf("array<float>")
def create_embeddings_with_transformers(
sentences: Iterator[pd.Series],
) -> Iterator[pd.Series]:
sentences: Iterator[pd.Series],) -> Iterator[pd.Series]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
for batch in sentences:
yield pd.Series(model.encode(batch).tolist())
Expand All @@ -249,11 +248,11 @@ def create_embeddings_with_transformers(
transformer_type = "sentence-transformers/all-MiniLM-L6-v2"
embedding_spark_df = (
df.limit(1000)
.withColumn("vector", create_embeddings_with_transformers("title"))
.withColumn("values", create_embeddings_with_transformers("title"))
.withColumn("namespace", F.lit(transformer_type))
.withColumn("metadata", F.to_json(F.struct(F.col("topic").alias("TOPIC"))))
# We select these columns because they are expected by the Spark-Pinecone connector
.select("id", "vector", "namespace", "metadata")
.select("id", "values", "namespace", "metadata")
)
display(embedding_spark_df)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,9 @@
# -----------------------------------
# 2 We connect an LLM for Hyde (we could use a slightly more advanced model, 'text-davinci-003', since we have some more logic in this prompt).

# hyde_llm=jekyll_llm
hyde_llm = jekyll_llm
# Uncomment the line below if you were to use OpenAI instead
hyde_llm = OpenAI(model="text-davinci-003")
# hyde_llm = OpenAI(model="text-davinci-003")

# -----------------------------------
# -----------------------------------
Expand Down Expand Up @@ -273,7 +273,7 @@
# MAGIC %md
# MAGIC ## `DaScie` - Our first vector database data science AI agent!
# MAGIC
# MAGIC In this section we're going to build an Agent based on the [ReAct paradigm](https://react-lm.github.io/) (or though-action-observation loop) that will take instructions in plain text and perform data science analysis on data that we've stored in a vector database. The agent type we'll use is using zero-shot learning, which takes in the prompt and leverages the underlying LLMs' zero-shot abilities.
# MAGIC In this section we're going to build an Agent based on the [ReAct paradigm](https://react-lm.github.io/) (or thought-action-observation loop) that will take instructions in plain text and perform data science analysis on data that we've stored in a vector database. The agent type we'll use is using zero-shot learning, which takes in the prompt and leverages the underlying LLMs' zero-shot abilities.

# COMMAND ----------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@
# MAGIC - [`map_reduce`](https://docs.langchain.com/docs/components/chains/index_related_chains#map-reduce) - This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary of that chunk; for question-answering tasks, it could be an answer based solely on that chunk).
# MAGIC - [`refine`](https://docs.langchain.com/docs/components/chains/index_related_chains#refine) - This method involves running an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document.
# MAGIC - [`map_rerank`](https://docs.langchain.com/docs/components/chains/index_related_chains#map-rerank) - This method involves running an initial prompt on each chunk of data, that not only tries to complete a task but also gives a score for how certain it is in its answer. The responses are then ranked according to this score, and the highest score is returned.
# MAGIC * NOTE: For this exercise, `map_rerank` will [error](https://github.com/hwchase17/langchain/issues/3970).

# COMMAND ----------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

# MAGIC %md ## Dataset
# MAGIC
# MAGIC We will use a subset of the `cnn_dailymail` dataset from See et al., 2017, downloadable from the Hugging Face `datasets` hub: https://huggingface.co/datasets/cnn_dailymail
# MAGIC We will use a subset of the `cnn_dailymail` dataset from See et al., 2017, downloadable from the [Hugging Face `datasets` hub](https://huggingface.co/datasets/cnn_dailymail).
# MAGIC
# MAGIC This dataset provides news articles paired with summaries (in the "highlights" column). Let's load the data and take a look at some examples.

Expand Down Expand Up @@ -202,7 +202,7 @@ def perform_inference(batch: list) -> list:
# MAGIC
# MAGIC Now that we can generate summaries---and we know 0/1 accuracy is useless here---let's look at how we can compute a meaningful metric designed to evaluate summarization: ROUGE.
# MAGIC
# MAGIC Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is a set of evaluation metrics designed for comparing summaries from Lin et al., 2004. See https://en.wikipedia.org/wiki/ROUGE_(metric) for more info. Here, we use the Hugging Face Evaluator wrapper to call into the `rouge_score` package. This package provides 4 scores:
# MAGIC Recall-Oriented Understudy for Gisting Evaluation (ROUGE) is a set of evaluation metrics designed for comparing summaries from Lin et al., 2004. See [Wikipedia](https://en.wikipedia.org/wiki/ROUGE_&#40;metric&#41;) for more info. Here, we use the Hugging Face Evaluator wrapper to call into the `rouge_score` package. This package provides 4 scores:
# MAGIC
# MAGIC * `rouge1`: ROUGE computed over unigrams (single words or tokens)
# MAGIC * `rouge2`: ROUGE computed over bigrams (pairs of consecutive words or tokens)
Expand Down
4 changes: 2 additions & 2 deletions Version Info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
# MAGIC # Project Information
# MAGIC
# MAGIC * Name: **Large Language Models**
# MAGIC * Version: **1.0.2**
# MAGIC * Built On: **Jun 8, 2023 at 00:52:11 UTC**
# MAGIC * Version: **1.0.3**
# MAGIC * Built On: **Jul 12, 2023 at 18:13:09 UTC**

# COMMAND ----------

Expand Down

0 comments on commit a3b4301

Please sign in to comment.