Skip to content

Commit

Permalink
Merge pull request #32 from scripturecentralqa/split_knowhys
Browse files Browse the repository at this point in the history
Split the knowhys documents
  • Loading branch information
DallanQ authored Oct 21, 2023
2 parents cb5ec56 + be7f7bf commit 454003e
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 10 deletions.
8 changes: 4 additions & 4 deletions .safety-policy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ security: # configuration for the `safety check` command
ignore-vulnerabilities: # Here you can list multiple specific vulnerabilities you want to ignore (optionally for a time period)
# We recommend making use of the optional `reason` and `expires` keys for each vulnerability that you ignore.
60433: # Example vulnerability ID
reason: ignore until December, then get the latest version
expires: "2023-12-01" # datetime string - date this ignore will expire, best practice to use this variable
reason: ignore until 2024, then get the latest version
expires: "2024-01-01" # datetime string - date this ignore will expire, best practice to use this variable
59399: # Example vulnerability ID
reason: ignore unil December, then get the latest version
expires: "2023-12-01" # datetime string - date this ignore will expire, best practice to use this variable
reason: ignore until 2024, then get the latest version
expires: "2024-01-01" # datetime string - date this ignore will expire, best practice to use this variable
continue-on-vulnerability-error: False # Suppress non-zero exit codes when vulnerabilities are found. Enable this in pipelines and CI/CD processes if you want to pass builds that have vulnerabilities. We recommend you set this to False.
alert: # configuration for the `safety alert` command
security:
Expand Down
4 changes: 2 additions & 2 deletions models/load_know.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Load conference talks."""
"""Load knowhys."""

import json
import os
Expand All @@ -22,7 +22,7 @@ def _to_markdown(html: str, **options: Any) -> str:


def load_knowhy(url: str, html: str, bs_parser: str = "html.parser") -> Document:
"""Load a conference talk from a url and html."""
"""Load knowhys from a url and html."""
soup = BeautifulSoup(html, bs_parser)
title = soup.find("h1", class_="page-title").text
author = soup.find("div", class_="field-nam-author").text.replace("Post contributed by", "")
Expand Down
7 changes: 4 additions & 3 deletions notebooks/15_knowhy_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
"outputs": [],
"source": [
"loader = KnowhyLoader(input_dir)\n",
"docs = loader.load(verbose=True)"
"docs = loader.load(verbose=True)\n",
"docs"
]
},
{
Expand All @@ -70,7 +71,7 @@
"source": [
"output_filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
"\n",
"save_docs_to_jsonl(docs, output_filename)"
"save_docs_to_jsonl(docs, output_filename)\n"
]
}
],
Expand All @@ -90,7 +91,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
"version": "3.10.13"
}
},
"nbformat": 4,
Expand Down
160 changes: 160 additions & 0 deletions notebooks/25_knowhy_splitter.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split loaded knowhy documents using a recursive Markdown text splitter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import os\n",
"\n",
"from IPython.display import display, Markdown\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter, Language\n",
"\n",
"from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl\n",
"from models.split_model import ModelTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# configure\n",
"input_path = '../data/load/output/knowhys/2023-10-20.jsonl'\n",
"# split_model_path = '../data/split/model/2023-09-24.pkl'\n",
"# split_threshold = 0.55\n",
"chunk_size = 2000\n",
"chunk_overlap = 200\n",
"anchor = \"anchor\"\n",
"output_dir = '../data/split/output/knowhys/'\n",
"today = datetime.today().strftime('%Y-%m-%d')\n",
"length_function = len"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs = load_docs_from_jsonl(input_path)\n",
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create splits"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" Language.MARKDOWN,\n",
" chunk_size=chunk_size, \n",
" chunk_overlap=chunk_overlap,\n",
" length_function=length_function,\n",
")\n",
"split = text_splitter.split_documents(docs)\n",
"len(split)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"split[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save splits "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
"save_docs_to_jsonl(split, filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "models",
"language": "python",
"name": "models"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"mypy",
"tests",
"typeguard",
"xdoctest",
# "xdoctest",
"docs-build",
)

Expand Down

0 comments on commit 454003e

Please sign in to comment.