Merge pull request #31 from scripturecentralqa/knowhys_loader

added knowhy loader notebook
iloveconference · Oct 21, 2023 · cb5ec56 · cb5ec56
2 parents 42c3210 + a96bf0d
commit cb5ec56
Show file tree

Hide file tree

Showing 11 changed files with 1,423 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,5 @@ __pycache__/
 /notebooks/wandb/
 .idea/
 .venv/
+.vscode/
 *.log
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+  "[python]": {
+    "editor.defaultFormatter": "ms-python.black-formatter"
+  },
+  "python.formatting.provider": "none"
+}
diff --git a/README.md b/README.md
@@ -27,6 +27,8 @@ Install dependencies using poetry: `poetry install`
 
 Install nox: `poetry run pipx install nox && poetry run pipx inject nox nox-poetry`
 
+Install pre-commit as a git hook: `poetry run pre-commit install`
+
 Install spacy model: `poetry run python -m spacy download en_core_web_sm`
 
 Create a `.env` file with the following variables:

diff --git a/models/load_conf.py b/models/load_conf.py
@@ -14,6 +14,8 @@
 from markdownify import MarkdownConverter  # type: ignore
 from tqdm import tqdm
 
+from models.load_utils import clean
+
 
 class ConferenceMarkdownConverter(MarkdownConverter):  # type: ignore
     """Create a custom MarkdownConverter."""
@@ -43,11 +45,6 @@ def _to_markdown(html: str, **options: Any) -> str:
     return cast(str, ConferenceMarkdownConverter(**options).convert(html))
 
 
-def _clean(text: str) -> str:
-    """Replace non-breaking space with normal space."""
-    return text.replace(" ", " ")
-
-
 def load_conference_talk(url: str, html: str, bs_parser: str = "html.parser") -> Document:
     """Load a conference talk from a url and html."""
     path_components = urlparse(url).path.split("/")
@@ -57,14 +54,14 @@ def load_conference_talk(url: str, html: str, bs_parser: str = "html.parser") ->
     author = soup.select_one("article p.author-name")
     author_role = soup.select_one("article p.author-role")
     body = soup.select_one("article div.body-block")
-    content = _clean(_to_markdown(str(body), base_url=url)) if body else ""
+    content = clean(_to_markdown(str(body), base_url=url)) if body else ""
     metadata = {
         "year": year,
         "month": month,
         "url": url,
-        "title": _clean(title.text) if title else "",
-        "author": _clean(author.text) if author else "",
-        "author_role": _clean(author_role.text) if author_role else "",
+        "title": clean(title.text) if title else "",
+        "author": clean(author.text) if author else "",
+        "author_role": clean(author_role.text) if author_role else "",
     }
     return Document(page_content=content, metadata=metadata)
 

diff --git a/models/load_know.py b/models/load_know.py
@@ -0,0 +1,74 @@
+"""Load knowhy articles."""
+
+import json
+import os
+from typing import Any
+from typing import Iterator
+from typing import cast
+
+from bs4 import BeautifulSoup  # type: ignore
+from langchain.document_loaders.base import BaseLoader
+from langchain.schema.document import Document
+from markdownify import MarkdownConverter  # type: ignore
+from tqdm import tqdm
+
+from models.load_utils import clean
+
+
+# Shorthand for running the markdown converter over an html string
+def _to_markdown(html: str, **options: Any) -> str:
+    """Return the markdown rendering of html, forwarding options to the converter."""
+    converter = MarkdownConverter(**options)
+    return cast(str, converter.convert(html))
+
+
+def load_knowhy(url: str, html: str, bs_parser: str = "html.parser") -> Document:
+    """Load a knowhy article from a url and html.
+
+    Args:
+        url: Address of the page; stored in the metadata and passed to the
+            markdown converter as ``base_url``.
+        html: Raw HTML of the page.
+        bs_parser: Name of the BeautifulSoup parser to use.
+
+    Returns:
+        A Document whose ``page_content`` is the markdown of the ``group-left``
+        div and whose metadata holds ``url``, ``title``, ``author``, ``date`` and
+        ``citation``. Any element that is missing from the page becomes "".
+    """
+    soup = BeautifulSoup(html, bs_parser)
+    # Keep the elements themselves and read ``.text`` only after the None check
+    # below; calling ``.text`` on a failed ``find`` raises AttributeError.
+    title = soup.find("h1", class_="page-title")
+    # NOTE(review): "field-nam-author" may be a typo for "field-name-author"
+    # (compare the publish-date class) - confirm against the scraped pages.
+    author = soup.find("div", class_="field-nam-author")
+    date = soup.find("div", class_="field-name-publish-date")
+    citation = soup.find(id="block-views-knowhy-citation-block")
+    body = soup.find("div", class_="group-left")
+    content = clean(_to_markdown(str(body), base_url=url)) if body else ""
+
+    metadata = {
+        "url": url,
+        "title": clean(title.text) if title else "",
+        "author": clean(author.text.replace("Post contributed by", "")) if author else "",
+        "date": clean(date.text) if date else "",
+        "citation": clean(_to_markdown(str(citation), base_url=url)) if citation else "",
+    }
+    return Document(page_content=content, metadata=metadata)
+
+
+class KnowhyLoader(BaseLoader):
+    """Loader for knowhy articles stored as JSON files in a directory.
+
+    Every file in ``path`` is read as JSON and must provide ``url`` and ``html``
+    keys, which are handed to ``load_knowhy``.
+    """
+
+    def lazy_load(self) -> Iterator[Document]:
+        """A lazy loader for Documents (not implemented; always raises)."""
+        raise NotImplementedError(f"{self.__class__.__name__} does not implement lazy_load()")
+
+    def __init__(self, path: str = "", bs_parser: str = "html.parser"):
+        """Initialize loader.
+
+        Args:
+            path: Directory containing the JSON files to load.
+            bs_parser: Name of the BeautifulSoup parser passed to ``load_knowhy``.
+        """
+        super().__init__()
+        self.path = path
+        self.bs_parser = bs_parser
+
+    def load(self, verbose: bool = False) -> list[Document]:
+        """Load documents from path.
+
+        Args:
+            verbose: Show a progress bar and report skipped or incomplete files.
+
+        Returns:
+            The loaded documents; files without a title or content are skipped.
+        """
+        docs = []
+        for filename in tqdm(os.listdir(self.path), disable=not verbose):
+            path = os.path.join(self.path, filename)
+            with open(path, encoding="utf8") as f:
+                data = json.load(f)
+            doc = load_knowhy(data["url"], data["html"], bs_parser=self.bs_parser)
+            if not doc.metadata["title"] or not doc.page_content:
+                if verbose:
+                    print("Missing title or content - skipping", filename)
+                continue
+            # A missing author is only reported; the document is still kept.
+            if verbose and not doc.metadata["author"]:
+                print("Missing author", filename)
+            docs.append(doc)
+        return docs
diff --git a/models/load_utils.py b/models/load_utils.py
@@ -6,6 +6,11 @@
 from langchain.schema.document import Document
 
 
+def clean(text: str) -> str:
+    """Replace non-breaking space with normal space and remove surrounding whitespace.
+
+    Args:
+        text: The string to normalize.
+
+    Returns:
+        ``text`` with every U+00A0 turned into a regular space and with leading
+        and trailing whitespace stripped.
+    """
+    # Spell the non-breaking space as an escape: a literal U+00A0 looks exactly
+    # like a regular space and is easily lost when the file is copied or reformatted.
+    return text.replace("\u00a0", " ").strip()
+
+
 def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
     """Save documents to jsonl file."""
     with open(file_path, "w") as jsonl_file:

diff --git a/notebooks/15_knowhy_loader.ipynb b/notebooks/15_knowhy_loader.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "936d02dd",
+   "metadata": {},
+   "source": [
+    "# Load Knowhys\n",
+    "\n",
+    "Convert knowhy content from raw HTML to markdown format and extract key information. Write knowhys in JSONL format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fe5bf12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bca89a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "from models.load_know import KnowhyLoader\n",
+    "from models.load_utils import save_docs_to_jsonl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e753397e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# config\n",
+    "input_dir = '../data/load/raw/knowhys/'\n",
+    "output_dir = '../data/load/output/knowhys/'\n",
+    "\n",
+    "today = datetime.today().strftime('%Y-%m-%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f5ebfa4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = KnowhyLoader(input_dir)\n",
+    "docs = loader.load(verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7949104b-e0e2-42eb-aa9f-6140722a1d1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
+    "\n",
+    "save_docs_to_jsonl(docs, output_filename)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "models",
+   "language": "python",
+   "name": "models"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/35_conference_indexer.ipynb b/notebooks/35_conference_indexer.ipynb
@@ -44,7 +44,7 @@
    "source": [
     "# configure\n",
     "split_path = \"../data/split/output/2023-09-24.jsonl\"\n",
-    "index_name = \"conf-ada-002\"\n",
+    "index_name = \"scqa\"\n",
     "batch_size = 100\n",
     "text_field = \"text\"\n",
     "embedding_model, embedding_len, embedding_metric = (\"text-embedding-ada-002\", 1536, \"cosine\")"
@@ -83,7 +83,7 @@
    "outputs": [],
    "source": [
     "pinecone.init(\n",
-    "    api_key=os.environ['PINECONE_API_KEY'], \n",
+    "    api_key=os.environ['PINECONE_API_KEY'],\n",
     "    environment=os.environ['PINECONE_ENV'],\n",
     ")\n",
     "\n",
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,4 +16,5 @@ __pycache__/ @@
     /notebooks/wandb/
     .idea/
     .venv/
+    .vscode/
     *.log