From ea84bcd59d2df5de6f2ddef079dfa0dbeee44f5c Mon Sep 17 00:00:00 2001 From: Anishere Mariam Adeola <37239247+AnishereMariam@users.noreply.github.com> Date: Tue, 22 Oct 2024 23:04:07 +0100 Subject: [PATCH 1/2] Documentation branch (#463) * fix Content block error for 'raw' directive * fix 'mdinclude' error on notes * fix indentation error on changelog.md * add nbsphinx to conf.py extension * render jupyter notebook on documentation * erase empty line from rendered notebook * fix undefined label error in contribution page * Hide notebook outputs and remove kernel --------- Co-authored-by: Andrew Tavis McAllister --- CHANGELOG.md | 6 +- docs/source/_static/CONTRIBUTING.rst | 2 +- docs/source/conf.py | 3 + docs/source/notes.rst | 4 +- .../wikipedia/gen_autosuggestions.rst | 7 +- .../scribe_data/wikipedia/notebook.ipynb | 308 ++++++++++++++++++ requirements.txt | 1 + src/scribe_data/unicode/process_unicode.py | 1 + src/scribe_data/unicode/unicode_utils.py | 1 + src/scribe_data/utils.py | 1 + src/scribe_data/wikidata/check_query/check.py | 1 + src/scribe_data/wikidata/check_query/query.py | 1 + .../wikidata/check_query/sparql.py | 1 + src/scribe_data/wikidata/query_data.py | 1 + src/scribe_data/wikipedia/extract_wiki.py | 1 + .../wikipedia/gen_autosuggestions.ipynb | 4 +- src/scribe_data/wikipedia/process_wiki.py | 1 + 17 files changed, 333 insertions(+), 11 deletions(-) create mode 100644 docs/source/scribe_data/wikipedia/notebook.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index ea1905b32..53f95a3db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,9 +16,9 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). - Scribe-Data is now a fully functional CLI. - Querying Wikidata lexicographical data can be done via the `--query` command ([#159](https://github.com/scribe-org/Scribe-Data/issues/159)). 
- - The output type of queries can be in JSON, CSV, TSV and SQLite, with conversions output types also being possible ([#145](https://github.com/scribe-org/Scribe-Data/issues/145), [#146](https://github.com/scribe-org/Scribe-Data/issues/146)) - - Output paths can be set for query results ([#144](https://github.com/scribe-org/Scribe-Data/issues/144)). - - The version of the CLI can be printed to the command line and the CLI can further be used to upgrade itself ([#186](https://github.com/scribe-org/Scribe-Data/issues/186), [#157 ](https://github.com/scribe-org/Scribe-Data/issues/157)). + - The output type of queries can be in JSON, CSV, TSV and SQLite, with conversions output types also being possible ([#145](https://github.com/scribe-org/Scribe-Data/issues/145), [#146](https://github.com/scribe-org/Scribe-Data/issues/146)) + - Output paths can be set for query results ([#144](https://github.com/scribe-org/Scribe-Data/issues/144)). + - The version of the CLI can be printed to the command line and the CLI can further be used to upgrade itself ([#186](https://github.com/scribe-org/Scribe-Data/issues/186), [#157 ](https://github.com/scribe-org/Scribe-Data/issues/157)). - Total Wikidata lexemes for languages and data types can be derived with the `--total` command ([#147](https://github.com/scribe-org/Scribe-Data/issues/147)). - Commands can be used via an interactive mode with the `--interactive` command ([#158](https://github.com/scribe-org/Scribe-Data/issues/158)). - Articles are removed from machine translations so they're more directly useful in Scribe applications ([#96](https://github.com/scribe-org/Scribe-Data/issues/96)). 
diff --git a/docs/source/_static/CONTRIBUTING.rst b/docs/source/_static/CONTRIBUTING.rst index 4a34e1ffc..8cade2b22 100644 --- a/docs/source/_static/CONTRIBUTING.rst +++ b/docs/source/_static/CONTRIBUTING.rst @@ -16,7 +16,7 @@ Contents - `First steps as a contributor <#first-steps-as-a-contributor>`__ - `Learning the tech stack <#learning-the-tech-stack>`__ - `Development environment <#development-environment>`__ -- `Issues and projects <#issues-projects>`__ +- `Issues and projects <#issues-and-projects>`__ - `Bug reports <#bug-reports>`__ - `Feature requests <#feature-requests>`__ - `Pull requests <#pull-requests>`__ diff --git a/docs/source/conf.py b/docs/source/conf.py index 60dbb3922..78613691c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -40,8 +40,11 @@ "numpydoc", "sphinx.ext.viewcode", "sphinx.ext.imgmath", + "nbsphinx", ] +nbsphinx_allow_errors = True +nbsphinx_execute = "never" numpydoc_show_inherited_class_members = False numpydoc_show_class_members = False diff --git a/docs/source/notes.rst b/docs/source/notes.rst index 9d9aa20d0..f72b2f2a8 100644 --- a/docs/source/notes.rst +++ b/docs/source/notes.rst @@ -1,4 +1,4 @@ -.. mdinclude:: _static/CONTRIBUTING.rst +.. include:: _static/CONTRIBUTING.rst License ======= @@ -6,4 +6,4 @@ License .. literalinclude:: ../../LICENSE.txt :language: text -.. mdinclude:: ../../CHANGELOG.md +.. include:: ../../CHANGELOG.md diff --git a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst index 5f4c90b00..e69334a18 100644 --- a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst +++ b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst @@ -3,9 +3,10 @@ gen_autosuggestions.ipynb `View code on Github `_ -Scribe Autosuggest Generation ------------------------------ - This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps. +.. 
toctree:: + + notebook.ipynb + Use the :code:`View code on GitHub` link above to view the notebook and explore the process! diff --git a/docs/source/scribe_data/wikipedia/notebook.ipynb b/docs/source/scribe_data/wikipedia/notebook.ipynb new file mode 100644 index 000000000..cdff0eb23 --- /dev/null +++ b/docs/source/scribe_data/wikipedia/notebook.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "743abe55", + "metadata": { + "toc": true + }, + "source": [ + "

Table of Contents

\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "592e4b36", + "metadata": {}, + "source": [ + "**Scribe Autosuggest Generation**\n", + "\n", + "This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec5ff38", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c8c7a44", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:39.142528Z", + "start_time": "2023-04-10T19:52:39.087499Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "from tqdm.auto import tqdm\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14a5bf58", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:39.147871Z", + "start_time": "2023-04-10T19:52:39.144127Z" + } + }, + "outputs": [], + "source": [ + "pwd = os.path.dirname(os.path.realpath(\"gen_autosuggestions.ipynb\"))\n", + "pwd = pwd.split(\"scribe_data\")[0]\n", + "sys.path.append(pwd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7939bd", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:52.508933Z", + "start_time": "2023-04-10T19:52:52.506137Z" + } + }, + "outputs": [], + "source": [ + "from scribe_data.wikipedia.extract_wiki import (\n", + " download_wiki,\n", + " parse_to_ndjson,\n", + ")\n", + "from scribe_data.wikipedia.process_wiki import (\n", + " clean,\n", + " gen_autosuggestions,\n", + ")\n", + "from scribe_data.utils import get_language_iso" + ] + }, + { + "cell_type": "markdown", + "id": "2add942e", + "metadata": {}, + "source": [ + "# Download and Parse Wiki" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "a722df43", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:53:16.467643Z", + "start_time": "2023-04-10T19:53:16.464619Z" + } + }, + "outputs": [], + "source": [ + "# Languages: French, German, Italian, Portuguese, Russian, Spanish, Swedish\n", + "language = \"French\"\n", + "language_abbr = get_language_iso(language)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11546a55", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:53:28.138818Z", + "start_time": "2023-04-10T19:53:17.184354Z" + } + }, + "outputs": [], + "source": [ + "files = download_wiki(\n", + " language=language,\n", + " target_dir=f\"./{language_abbr}wiki_dump\",\n", + " file_limit=None, # None is all files\n", + " dump_id=\"20220920\"\n", + ")\n", + "print(f\"Number of files: {len(files)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b40fd9d9", + "metadata": { + "ExecuteTime": { + "end_time": "2022-10-03T12:25:23.192390Z", + "start_time": "2022-10-03T12:25:23.189124Z" + } + }, + "outputs": [], + "source": [ + "parse_to_ndjson(\n", + " output_path=f\"./{language_abbr}wiki.ndjson\",\n", + " input_dir=f\"./{language_abbr}wiki_dump\",\n", + " partitions_dir=f\"./{language_abbr}wiki_partitions\",\n", + " article_limit=None, # None is all articles\n", + " delete_parsed_files=True,\n", + " multicore=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3c3f2f51", + "metadata": {}, + "source": [ + "# Process and Clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "205a01b4", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:27.126Z" + } + }, + "outputs": [], + "source": [ + "with open(f\"./{language_abbr}wiki.ndjson\", \"r\") as fin:\n", + " article_texts = [\n", + " json.loads(lang)[1] for lang in tqdm(fin, desc=\"Articles added\", unit=\"articles\")\n", + " ]\n", + "\n", + 
"print(f\"Number of articles: {len(article_texts)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1b869f4", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:34.201Z" + } + }, + "outputs": [], + "source": [ + "# Define sample size for up to 1 million articles.\n", + "sample_size = 1000000 / len(article_texts)\n", + "sample_size = min(sample_size, 1)\n", + "sample_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea9ea16c", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:40.574Z" + } + }, + "outputs": [], + "source": [ + "text_corpus = clean(\n", + " texts=article_texts,\n", + " language=language,\n", + " remove_words=None,\n", + " sample_size=sample_size,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "593e855d", + "metadata": {}, + "source": [ + "# Generate and Upload" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cda9e874", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:54.735Z" + } + }, + "outputs": [], + "source": [ + "autosuggest_dict = gen_autosuggestions(\n", + " text_corpus,\n", + " language=language,\n", + " num_words=1000,\n", + " ignore_words=None,\n", + " update_local_data=True,\n", + " verbose=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8c385b7", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:55.451Z" + } + }, + "outputs": [], + "source": [ + "# autosuggest_dict" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + 
"number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": true, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt index abbd5e443..03ed90a90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ flax>=0.8.2 iso639-lang>=2.2.3 m2r2>=0.3.3 mwparserfromhell>=0.6 +nbsphinx>=0.9.5 numpydoc>=1.6.0 packaging>=20.9 pandas>=1.5.3 diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 223a40fec..21f550e5f 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -2,6 +2,7 @@ Module for processing Unicode based corpuses for autosuggestion and autocompletion generation. .. raw:: html +