diff --git a/docs/examples/Throughput_Comparison.ipynb b/docs/examples/Throughput_Comparison.ipynb new file mode 100644 index 00000000..5098cbd7 --- /dev/null +++ b/docs/examples/Throughput_Comparison.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🤗 Huggingface vs ⚡ FastEmbed️\n", + "\n", + "Comparing the performance of Huggingface's 🤗 Transformers and ⚡ FastEmbed️ on a simple task on the following machine: Apple M2 Max, 32 GB RAM\n", + "\n", + "## 📦 Imports\n", + "\n", + "Importing the necessary libraries for this comparison." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from pathlib import Path\n", + "from typing import Any, Callable, List, Tuple\n", + "\n", + "import numpy as np\n", + "import torch.nn.functional as F\n", + "from fastembed.embedding import DefaultEmbedding\n", + "from torch import Tensor\n", + "from transformers import AutoModel, AutoTokenizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📖 Data\n", + "\n", + "data is a list of strings, each string is a document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents: List[str] = [\n", + " \"Chandrayaan-3 is India's third lunar mission\",\n", + " \"It aimed to land a rover on the Moon's surface - joining the US, China and Russia\",\n", + " \"The mission is a follow-up to Chandrayaan-2, which had partial success\",\n", + " \"Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)\",\n", + " \"The estimated cost of the mission is around $35 million\",\n", + " \"It will carry instruments to study the lunar surface and atmosphere\",\n", + " \"Chandrayaan-3 landed on the Moon's surface on 23rd August 2023\",\n", + " \"It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.\",\n", + " \"The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit\",\n", + " \"The mission used GSLV Mk III rocket for its launch\",\n", + " \"Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota\",\n", + " \"Chandrayaan-3 was launched earlier in the year 2023\",\n", + "]\n", + "len(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting up 🤗 Huggingface\n", + "\n", + "We'll be using the [Huggingface Transformers](https://huggingface.co/transformers/) with PyTorch library to generate embeddings. We'll be using the same model across both libraries for a fair(er?) comparison." 
class HF:
    """
    Hugging Face Transformers baseline used to benchmark against FastEmbed.

    Mirrors the Transformers usage recipe published on the BAAI/bge model
    cards (e.g. https://huggingface.co/BAAI/bge-base-en): tokenize the texts,
    run the encoder, keep the output at the first token position of every
    sequence and normalise it to unit L2 length.

    Args:
        model_id: Identifier handed unchanged to ``AutoModel.from_pretrained``
            and ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, model_id: str):
        self.model = AutoModel.from_pretrained(model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def embed(self, texts: List[str]) -> Tensor:
        """
        Embed ``texts`` in a single padded batch.

        Args:
            texts: Documents to embed; inputs are truncated to 512 tokens and
                padded to the longest sequence in the batch.

        Returns:
            One L2-normalised row per input text, detached from autograd.
        """
        # Function-scope import: the notebook's import cell only brings in
        # ``torch.nn.functional`` and ``torch.Tensor``, not the ``torch``
        # namespace itself.
        import torch

        encoded_input = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors="pt")

        # This is inference only. Without no_grad() the forward pass records an
        # autograd graph, which costs time and memory and would be charged to
        # the Transformers side of the throughput comparison.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # First element of the model output, sliced at token position 0 of
        # every sequence (the [CLS] position for BERT-style tokenizers).
        sentence_embeddings = model_output[0][:, 0]

        # p=2, dim=1 are F.normalize's defaults; spelled out for the reader.
        return F.normalize(sentence_embeddings, p=2, dim=1)
def calculate_time_stats(embed_func: Callable, documents: list, k: int) -> Tuple[float, float, float]:
    """
    Time ``k`` consecutive calls of ``embed_func(documents)``.

    Args:
        embed_func: Callable taking the document list. It may return its
            result eagerly or as a lazy iterator/generator.
        documents: Passed unchanged to ``embed_func`` on every run.
        k: Number of timed runs; must be at least 1.

    Returns:
        ``(mean, max, min)`` duration of a single call, in seconds.

    Raises:
        ValueError: If ``k`` is smaller than 1 (there would be nothing to
            average).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    times = []
    for _ in range(k):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = embed_func(documents)
        if hasattr(result, "__next__"):
            # A lazy iterator has done no work yet. Drain it inside the timed
            # region so the measurement covers the actual embedding, without
            # keeping the items around.
            for _item in result:
                pass
        end_time = time.perf_counter()

        times.append(end_time - start_time)

    # Returning mean, max, and min time for the call
    return (sum(times) / k, max(times), min(times))
def plot_character_per_second_comparison(
    hf_stats: Tuple[float, float, float], fst_stats: Tuple[float, float, float], documents: list
):
    """
    Draw a two-bar chart of characters processed per second.

    Throughput for each library is the total character count of ``documents``
    divided by the first entry of its stats tuple (the mean time per call).

    Args:
        hf_stats: Timing tuple for the Hugging Face run; index 0 is used.
        fst_stats: Timing tuple for the FastEmbed run; index 0 is used.
        documents: The documents that were embedded during timing.
    """
    # Total number of characters across all documents
    n_chars = 0
    for doc in documents:
        n_chars += len(doc)

    # One throughput figure per library, in the same order as the labels
    labels = ["HF Embed (Torch)", "FastEmbed"]
    throughput = [n_chars / stats[0] for stats in (hf_stats, fst_stats)]

    rects = plt.bar(labels, throughput, color=["#1f356c", "#dd1f4b"])
    plt.ylabel("Characters per Second")
    plt.title("Characters Processed per Second Comparison")

    # Print each value centred just above its bar
    label_style = dict(ha="center", va="bottom", color="#1f356c", fontsize=12)
    for rect, value in zip(rects, throughput):
        x_mid = rect.get_x() + rect.get_width() / 2
        plt.text(x_mid, rect.get_height(), f"{value:.1f}", **label_style)

    plt.show()
a/fooling_around/02_HF_vs_FastEmbed.ipynb +++ /dev/null @@ -1,309 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 🤗 Huggingface vs ⚡ FastEmbed️\n", - "\n", - "Comparing the performance of Huggingface's 🤗 Transformers and ⚡ FastEmbed️ on a simple task on the following machine: Apple M2 Max, 32 GB RAM\n", - "\n", - "## 📦 Imports\n", - "\n", - "Importing the necessary libraries for this comparison." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "from pathlib import Path\n", - "from typing import Any, Callable, List, Tuple\n", - "\n", - "import numpy as np\n", - "import torch.nn.functional as F\n", - "from fastembed.embedding import DefaultEmbedding\n", - "from torch import Tensor\n", - "from transformers import AutoModel, AutoTokenizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📖 Data\n", - "\n", - "data is a list of strings, each string is a document." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "documents: List[str] = [\n", - " \"Chandrayaan-3 is India's third lunar mission\",\n", - " \"It aimed to land a rover on the Moon's surface - joining the US, China and Russia\",\n", - " \"The mission is a follow-up to Chandrayaan-2, which had partial success\",\n", - " \"Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)\",\n", - " \"The estimated cost of the mission is around $35 million\",\n", - " \"It will carry instruments to study the lunar surface and atmosphere\",\n", - " \"Chandrayaan-3 landed on the Moon's surface on 23rd August 2023\",\n", - " \"It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. 
Its propulsion module would act like an orbiter.\",\n", - " \"The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit\",\n", - " \"The mission used GSLV Mk III rocket for its launch\",\n", - " \"Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota\",\n", - " \"Chandrayaan-3 was launched earlier in the year 2023\",\n", - "]\n", - "len(documents)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up 🤗 Huggingface\n", - "\n", - "We'll be using the [Huggingface Transformers](https://huggingface.co/transformers/) with PyTorch library to generate embeddings. We'll be using the same model across both libraries for a fair(er?) comparison." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([12, 384])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "class HF:\n", - " \"\"\"\n", - " HuggingFace Transformer implementation of FlagEmbedding\n", - " Based on https://huggingface.co/BAAI/bge-base-en\n", - " \"\"\"\n", - "\n", - " def __init__(self, model_id: str):\n", - " self.model = AutoModel.from_pretrained(model_id)\n", - " self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n", - "\n", - " def embed(self, texts: List[str]):\n", - " encoded_input = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors=\"pt\")\n", - " model_output = self.model(**encoded_input)\n", - " sentence_embeddings = model_output[0][:, 0]\n", - " sentence_embeddings = F.normalize(sentence_embeddings)\n", - " return sentence_embeddings\n", - "\n", - "hf = HF(model_id=\"BAAI/bge-small-en\")\n", - "hf.embed(documents).shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up ⚡️FastEmbed\n", - "\n", - "Sorry, don't have a lot to set up here. 
We'll be using the default model, which is Flag Embedding, same as the Huggingface model." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_model = DefaultEmbedding()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📊 Comparison\n", - "\n", - "We'll be comparing the following metrics: Minimum, Maximum, Mean, across k runs. Let's write a function to do that:\n", - "\n", - "### 🚀 Calculating Stats" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_time_stats(embed_func: Callable, documents: list, k: int) -> Tuple[float, float, float]:\n", - " times = []\n", - " for _ in range(k):\n", - " # Timing the embed_func call\n", - " start_time = time.time()\n", - " embeddings = embed_func(documents)\n", - " end_time = time.time()\n", - "\n", - " times.append(end_time - start_time)\n", - "\n", - " # Returning mean, max, and min time for the call\n", - " return (sum(times) / k, max(times), min(times))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Huggingface Transformers (Average, Max, Min): (0.06899809837341309, 0.07179117202758789, 0.06620502471923828)\n", - "FastEmbed (Average, Max, Min): (0.6830369234085083, 0.6874828338623047, 0.6785910129547119)\n" - ] - } - ], - "source": [ - "hf_stats = calculate_time_stats(hf.embed, documents, k=2)\n", - "print(f\"Huggingface Transformers (Average, Max, Min): {hf_stats}\")\n", - "fst_stats = calculate_time_stats(lambda x: list(embedding_model.embed(x)), documents, k=2)\n", - "print(f\"FastEmbed (Average, Max, Min): {fst_stats}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📈 Results\n", - "\n", - "Let's run the comparison and see the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "\u001b[33mDEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install matplotlib --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def plot_character_per_second_comparison(\n", - " hf_stats: Tuple[float, float, float], fst_stats: Tuple[float, float, float], documents: list\n", - "):\n", - " # Calculating total characters in documents\n", - " total_characters = sum(len(doc) for doc in documents)\n", - "\n", - " # Calculating characters per second for each model\n", - " hf_chars_per_sec = total_characters / hf_stats[0] # Mean time is at index 0\n", - " fst_chars_per_sec = total_characters / fst_stats[0]\n", - "\n", - " # Plotting the bar chart\n", - " models = [\"HF Embed (Torch)\", \"FastEmbed\"]\n", - " chars_per_sec = [hf_chars_per_sec, fst_chars_per_sec]\n", - "\n", - " bars = plt.bar(models, chars_per_sec, color=[\"#1f356c\", \"#dd1f4b\"])\n", - " plt.ylabel(\"Characters per Second\")\n", - " plt.title(\"Characters Processed per Second Comparison\")\n", - "\n", - " # Adding the number at the top of each bar\n", - " for bar, chars in zip(bars, chars_per_sec):\n", - " plt.text(\n", - " bar.get_x() + bar.get_width() / 2,\n", - " bar.get_height(),\n", - " f\"{chars:.1f}\",\n", - " ha=\"center\",\n", - " va=\"bottom\",\n", - " color=\"#1f356c\",\n", - " fontsize=12,\n", - " )\n", - "\n", - " plt.show()\n", - "\n", - "\n", - "plot_character_per_second_comparison(hf_stats, fst_stats, documents)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fst", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}