Skip to content

Commit

Permalink
Merge pull request #31 from scripturecentralqa/knowhys_loader
Browse files Browse the repository at this point in the history
added knowhy loader notebook
  • Loading branch information
DallanQ authored Oct 21, 2023
2 parents 42c3210 + a96bf0d commit cb5ec56
Show file tree
Hide file tree
Showing 11 changed files with 1,423 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ __pycache__/
/notebooks/wandb/
.idea/
.venv/
.vscode/
*.log
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
}
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ Install dependencies using poetry: `poetry install`

Install nox: `poetry run pipx install nox && poetry run pipx inject nox nox-poetry`

Install pre-commit as a git hook: `poetry run pre-commit install`

Install spacy model: `poetry run python -m spacy download en_core_web_sm`

Create a `.env` file with the following variables:
Expand Down
15 changes: 6 additions & 9 deletions models/load_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from markdownify import MarkdownConverter # type: ignore
from tqdm import tqdm

from models.load_utils import clean


class ConferenceMarkdownConverter(MarkdownConverter): # type: ignore
"""Create a custom MarkdownConverter."""
Expand Down Expand Up @@ -43,11 +45,6 @@ def _to_markdown(html: str, **options: Any) -> str:
return cast(str, ConferenceMarkdownConverter(**options).convert(html))


def _clean(text: str) -> str:
"""Replace non-breaking space with normal space."""
return text.replace(" ", " ")


def load_conference_talk(url: str, html: str, bs_parser: str = "html.parser") -> Document:
"""Load a conference talk from a url and html."""
path_components = urlparse(url).path.split("/")
Expand All @@ -57,14 +54,14 @@ def load_conference_talk(url: str, html: str, bs_parser: str = "html.parser") ->
author = soup.select_one("article p.author-name")
author_role = soup.select_one("article p.author-role")
body = soup.select_one("article div.body-block")
content = _clean(_to_markdown(str(body), base_url=url)) if body else ""
content = clean(_to_markdown(str(body), base_url=url)) if body else ""
metadata = {
"year": year,
"month": month,
"url": url,
"title": _clean(title.text) if title else "",
"author": _clean(author.text) if author else "",
"author_role": _clean(author_role.text) if author_role else "",
"title": clean(title.text) if title else "",
"author": clean(author.text) if author else "",
"author_role": clean(author_role.text) if author_role else "",
}
return Document(page_content=content, metadata=metadata)

Expand Down
74 changes: 74 additions & 0 deletions models/load_know.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Load conference talks."""

import json
import os
from typing import Any
from typing import Iterator
from typing import cast

from bs4 import BeautifulSoup # type: ignore
from langchain.document_loaders.base import BaseLoader
from langchain.schema.document import Document
from markdownify import MarkdownConverter # type: ignore
from tqdm import tqdm

from models.load_utils import clean


# Create shorthand method for custom conversion
def _to_markdown(html: str, **options: Any) -> str:
    """Render an html string as markdown.

    All keyword options are passed straight to the MarkdownConverter constructor.
    """
    converter = MarkdownConverter(**options)
    markdown = converter.convert(html)
    return cast(str, markdown)


def _text_or_empty(node: Any) -> str:
    """Return the text of a parsed html node, or an empty string if the node is None."""
    return node.text if node is not None else ""


def load_knowhy(url: str, html: str, bs_parser: str = "html.parser") -> Document:
    """Load a knowhy from a url and html.

    Args:
        url: Address of the page; stored in the metadata and passed to the
            markdown converter as ``base_url``.
        html: Raw html of the page.
        bs_parser: Name of the BeautifulSoup parser to use.

    Returns:
        A Document whose page_content is the markdown of the ``div.group-left``
        element and whose metadata holds url, title, author, date and citation.
        Any element that is not found in the html yields an empty string.
    """
    soup = BeautifulSoup(html, bs_parser)
    # soup.find returns None when nothing matches, so never read .text off the
    # result directly: a missing element must produce "" rather than raise.
    title = _text_or_empty(soup.find("h1", class_="page-title"))
    # NOTE(review): "field-nam-author" looks like a typo for "field-name-author";
    # kept as-is until confirmed against the real page markup.
    author_node = soup.find("div", class_="field-nam-author")
    author = _text_or_empty(author_node).replace("Post contributed by", "")
    date = _text_or_empty(soup.find("div", class_="field-name-publish-date"))
    citation = soup.find(id="block-views-knowhy-citation-block")
    body = soup.find("div", class_="group-left")
    content = clean(_to_markdown(str(body), base_url=url)) if body else ""

    metadata = {
        "url": url,
        "title": clean(title),
        "author": clean(author),
        "date": clean(date),
        "citation": clean(_to_markdown(str(citation), base_url=url)) if citation else "",
    }
    return Document(page_content=content, metadata=metadata)


class KnowhyLoader(BaseLoader):
    """Loader for knowhys stored as JSON files in a directory.

    Each file is read as JSON and must provide the keys ``url`` and ``html``.
    """

    def lazy_load(self) -> Iterator[Document]:
        """A lazy loader for Documents."""
        raise NotImplementedError(f"{self.__class__.__name__} does not implement lazy_load()")

    def __init__(self, path: str = "", bs_parser: str = "html.parser"):
        """Initialize loader.

        Args:
            path: Directory containing the JSON files to load.
            bs_parser: Name of the BeautifulSoup parser passed to load_knowhy.
        """
        super().__init__()
        self.path = path
        self.bs_parser = bs_parser

    def load(self, verbose: bool = False) -> list[Document]:
        """Load documents from path.

        Args:
            verbose: Show a progress bar and report skipped or incomplete files.

        Returns:
            One Document per file that has both a title and content. Files
            missing either are skipped; files missing only an author are kept.
        """
        docs: list[Document] = []
        for filename in tqdm(os.listdir(self.path), disable=not verbose):
            path = os.path.join(self.path, filename)
            with open(path, encoding="utf8") as f:
                data = json.load(f)
            # The raw payload is intentionally not printed: it contains the whole
            # page html and would flood stdout for every file.
            doc = load_knowhy(data["url"], data["html"], bs_parser=self.bs_parser)
            if not doc.metadata["title"] or not doc.page_content:
                if verbose:
                    print("Missing title or content - skipping", filename)
                continue
            if verbose and not doc.metadata["author"]:
                print("Missing author", filename)
            docs.append(doc)
        return docs
5 changes: 5 additions & 0 deletions models/load_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from langchain.schema.document import Document


def clean(text: str) -> str:
    """Replace non-breaking space with normal space and remove surrounding whitespace."""
    # Spell the non-breaking space (U+00A0) as an escape: a literal one is
    # indistinguishable from a normal space and is easily lost when the file
    # is edited, reformatted or copied.
    return text.replace("\u00a0", " ").strip()


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
"""Save documents to jsonl file."""
with open(file_path, "w") as jsonl_file:
Expand Down
98 changes: 98 additions & 0 deletions notebooks/15_knowhy_loader.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "936d02dd",
"metadata": {},
"source": [
"# Load Talks\n",
"\n",
"Convert talk content from raw HTML to markdown format and extract key information. Write talks in JSONL format."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fe5bf12",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bca89a2",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"import os\n",
"\n",
"from models.load_know import KnowhyLoader\n",
"from models.load_utils import save_docs_to_jsonl"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e753397e",
"metadata": {},
"outputs": [],
"source": [
"# config\n",
"input_dir = '../data/load/raw/knowhys/'\n",
"output_dir = '../data/load/output/knowhys/'\n",
"\n",
"today = datetime.today().strftime('%Y-%m-%d')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f5ebfa4",
"metadata": {},
"outputs": [],
"source": [
"loader = KnowhyLoader(input_dir)\n",
"docs = loader.load(verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7949104b-e0e2-42eb-aa9f-6140722a1d1b",
"metadata": {},
"outputs": [],
"source": [
"output_filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
"\n",
"save_docs_to_jsonl(docs, output_filename)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "models",
"language": "python",
"name": "models"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions notebooks/35_conference_indexer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"source": [
"# configure\n",
"split_path = \"../data/split/output/2023-09-24.jsonl\"\n",
"index_name = \"conf-ada-002\"\n",
"index_name = \"scqa\"\n",
"batch_size = 100\n",
"text_field = \"text\"\n",
"embedding_model, embedding_len, embedding_metric = (\"text-embedding-ada-002\", 1536, \"cosine\")"
Expand Down Expand Up @@ -83,7 +83,7 @@
"outputs": [],
"source": [
"pinecone.init(\n",
" api_key=os.environ['PINECONE_API_KEY'], \n",
" api_key=os.environ['PINECONE_API_KEY'],\n",
" environment=os.environ['PINECONE_ENV'],\n",
")\n",
"\n",
Expand Down
Loading

0 comments on commit cb5ec56

Please sign in to comment.