From ea84bcd59d2df5de6f2ddef079dfa0dbeee44f5c Mon Sep 17 00:00:00 2001
From: Anishere Mariam Adeola
 <37239247+AnishereMariam@users.noreply.github.com>
Date: Tue, 22 Oct 2024 23:04:07 +0100
Subject: [PATCH 1/2] Documentation branch (#463)

* fix Content block error for 'raw' directive

* fix 'mdinclude' error on notes

* fix indentation error on changelog.md

* add nbsphinx to conf.py extensions

* render jupyter notebook on documentation

* erase empty line from rendered notebook

* fix undefined label error in contribution page

* Hide notebook outputs and remove kernel

---------

Co-authored-by: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
---
 CHANGELOG.md                                  |   6 +-
 docs/source/_static/CONTRIBUTING.rst          |   2 +-
 docs/source/conf.py                           |   3 +
 docs/source/notes.rst                         |   4 +-
 .../wikipedia/gen_autosuggestions.rst         |   7 +-
 .../scribe_data/wikipedia/notebook.ipynb      | 308 ++++++++++++++++++
 requirements.txt                              |   1 +
 src/scribe_data/unicode/process_unicode.py    |   1 +
 src/scribe_data/unicode/unicode_utils.py      |   1 +
 src/scribe_data/utils.py                      |   1 +
 src/scribe_data/wikidata/check_query/check.py |   1 +
 src/scribe_data/wikidata/check_query/query.py |   1 +
 .../wikidata/check_query/sparql.py            |   1 +
 src/scribe_data/wikidata/query_data.py        |   1 +
 src/scribe_data/wikipedia/extract_wiki.py     |   1 +
 .../wikipedia/gen_autosuggestions.ipynb       |   4 +-
 src/scribe_data/wikipedia/process_wiki.py     |   1 +
 17 files changed, 333 insertions(+), 11 deletions(-)
 create mode 100644 docs/source/scribe_data/wikipedia/notebook.ipynb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea1905b32..53f95a3db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,9 +16,9 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
 
 - Scribe-Data is now a fully functional CLI.
   - Querying Wikidata lexicographical data can be done via the `--query` command ([#159](https://github.com/scribe-org/Scribe-Data/issues/159)).
-    - The output type of queries can be in JSON, CSV, TSV and SQLite, with conversions output types also being possible ([#145](https://github.com/scribe-org/Scribe-Data/issues/145), [#146](https://github.com/scribe-org/Scribe-Data/issues/146))
-    - Output paths can be set for query results ([#144](https://github.com/scribe-org/Scribe-Data/issues/144)).
-    - The version of the CLI can be printed to the command line and the CLI can further be used to upgrade itself ([#186](https://github.com/scribe-org/Scribe-Data/issues/186), [#157 ](https://github.com/scribe-org/Scribe-Data/issues/157)).
+  - The output type of queries can be in JSON, CSV, TSV and SQLite, with conversions between output types also being possible ([#145](https://github.com/scribe-org/Scribe-Data/issues/145), [#146](https://github.com/scribe-org/Scribe-Data/issues/146)).
+  - Output paths can be set for query results ([#144](https://github.com/scribe-org/Scribe-Data/issues/144)).
+  - The version of the CLI can be printed to the command line and the CLI can further be used to upgrade itself ([#186](https://github.com/scribe-org/Scribe-Data/issues/186), [#157](https://github.com/scribe-org/Scribe-Data/issues/157)).
   - Total Wikidata lexemes for languages and data types can be derived with the `--total` command ([#147](https://github.com/scribe-org/Scribe-Data/issues/147)).
   - Commands can be used via an interactive mode with the `--interactive` command ([#158](https://github.com/scribe-org/Scribe-Data/issues/158)).
 - Articles are removed from machine translations so they're more directly useful in Scribe applications ([#96](https://github.com/scribe-org/Scribe-Data/issues/96)).
diff --git a/docs/source/_static/CONTRIBUTING.rst b/docs/source/_static/CONTRIBUTING.rst
index 4a34e1ffc..8cade2b22 100644
--- a/docs/source/_static/CONTRIBUTING.rst
+++ b/docs/source/_static/CONTRIBUTING.rst
@@ -16,7 +16,7 @@ Contents
 -  `First steps as a contributor <#first-steps-as-a-contributor>`__
 -  `Learning the tech stack <#learning-the-tech-stack>`__
 -  `Development environment <#development-environment>`__
--  `Issues and projects <#issues-projects>`__
+-  `Issues and projects <#issues-and-projects>`__
 -  `Bug reports <#bug-reports>`__
 -  `Feature requests <#feature-requests>`__
 -  `Pull requests <#pull-requests>`__
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 60dbb3922..78613691c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -40,8 +40,11 @@
     "numpydoc",
     "sphinx.ext.viewcode",
     "sphinx.ext.imgmath",
+    "nbsphinx",
 ]
 
+nbsphinx_allow_errors = True
+nbsphinx_execute = "never"
 numpydoc_show_inherited_class_members = False
 numpydoc_show_class_members = False
 
diff --git a/docs/source/notes.rst b/docs/source/notes.rst
index 9d9aa20d0..f72b2f2a8 100644
--- a/docs/source/notes.rst
+++ b/docs/source/notes.rst
@@ -1,4 +1,4 @@
-.. mdinclude:: _static/CONTRIBUTING.rst
+.. include:: _static/CONTRIBUTING.rst
 
 License
 =======
@@ -6,4 +6,4 @@ License
 .. literalinclude:: ../../LICENSE.txt
     :language: text
 
-.. mdinclude:: ../../CHANGELOG.md
+.. include:: ../../CHANGELOG.md
diff --git a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst
index 5f4c90b00..e69334a18 100644
--- a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst
+++ b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst
@@ -3,9 +3,10 @@ gen_autosuggestions.ipynb
 
 `View code on Github <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/wikipedia/gen_autosuggestions.ipynb>`_
 
-Scribe Autosuggest Generation
------------------------------
-
 This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps.
 
+.. toctree::
+
+   notebook.ipynb
+
 Use the :code:`View code on GitHub` link above to view the notebook and explore the process!
diff --git a/docs/source/scribe_data/wikipedia/notebook.ipynb b/docs/source/scribe_data/wikipedia/notebook.ipynb
new file mode 100644
index 000000000..cdff0eb23
--- /dev/null
+++ b/docs/source/scribe_data/wikipedia/notebook.ipynb
@@ -0,0 +1,308 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "743abe55",
+   "metadata": {
+    "toc": true
+   },
+   "source": [
+    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
+    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Download-and-Parse-Wiki\" data-toc-modified-id=\"Download-and-Parse-Wiki-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Download and Parse Wiki</a></span></li><li><span><a href=\"#Process-and-Clean\" data-toc-modified-id=\"Process-and-Clean-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Process and Clean</a></span></li><li><span><a href=\"#Generate-and-Upload\" data-toc-modified-id=\"Generate-and-Upload-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Generate and Upload</a></span></li></ul></div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "592e4b36",
+   "metadata": {},
+   "source": [
+    "**Scribe Autosuggest Generation**\n",
+    "\n",
+    "This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bec5ff38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c8c7a44",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-10T19:52:39.142528Z",
+     "start_time": "2023-04-10T19:52:39.087499Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import json\n",
+    "\n",
+    "from tqdm.auto import tqdm\n",
+    "from IPython.core.display import display, HTML\n",
+    "display(HTML(\"<style>.container { width:99% !important; }</style>\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14a5bf58",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-10T19:52:39.147871Z",
+     "start_time": "2023-04-10T19:52:39.144127Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "pwd = os.path.dirname(os.path.realpath(\"gen_autosuggestions.ipynb\"))\n",
+    "pwd = pwd.split(\"scribe_data\")[0]\n",
+    "sys.path.append(pwd)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c7939bd",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-10T19:52:52.508933Z",
+     "start_time": "2023-04-10T19:52:52.506137Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from scribe_data.wikipedia.extract_wiki import (\n",
+    "    download_wiki,\n",
+    "    parse_to_ndjson,\n",
+    ")\n",
+    "from scribe_data.wikipedia.process_wiki import (\n",
+    "    clean,\n",
+    "    gen_autosuggestions,\n",
+    ")\n",
+    "from scribe_data.utils import get_language_iso"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2add942e",
+   "metadata": {},
+   "source": [
+    "# Download and Parse Wiki"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a722df43",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-10T19:53:16.467643Z",
+     "start_time": "2023-04-10T19:53:16.464619Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Languages: French, German, Italian, Portuguese, Russian, Spanish, Swedish\n",
+    "language = \"French\"\n",
+    "language_abbr = get_language_iso(language)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11546a55",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-04-10T19:53:28.138818Z",
+     "start_time": "2023-04-10T19:53:17.184354Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "files = download_wiki(\n",
+    "    language=language,\n",
+    "    target_dir=f\"./{language_abbr}wiki_dump\",\n",
+    "    file_limit=None, # None is all files\n",
+    "    dump_id=\"20220920\"\n",
+    ")\n",
+    "print(f\"Number of files: {len(files)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b40fd9d9",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-10-03T12:25:23.192390Z",
+     "start_time": "2022-10-03T12:25:23.189124Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "parse_to_ndjson(\n",
+    "    output_path=f\"./{language_abbr}wiki.ndjson\",\n",
+    "    input_dir=f\"./{language_abbr}wiki_dump\",\n",
+    "    partitions_dir=f\"./{language_abbr}wiki_partitions\",\n",
+    "    article_limit=None, # None is all articles\n",
+    "    delete_parsed_files=True,\n",
+    "    multicore=True,\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c3f2f51",
+   "metadata": {},
+   "source": [
+    "# Process and Clean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "205a01b4",
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-10-03T12:25:27.126Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "with open(f\"./{language_abbr}wiki.ndjson\", \"r\") as fin:\n",
+    "    article_texts = [\n",
+    "        json.loads(lang)[1] for lang in tqdm(fin, desc=\"Articles added\", unit=\"articles\")\n",
+    "    ]\n",
+    "\n",
+    "print(f\"Number of articles: {len(article_texts)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1b869f4",
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-10-03T12:25:34.201Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Define sample size for up to 1 million articles.\n",
+    "sample_size = 1000000 / len(article_texts)\n",
+    "sample_size = min(sample_size, 1)\n",
+    "sample_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea9ea16c",
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-10-03T12:25:40.574Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "text_corpus = clean(\n",
+    "    texts=article_texts,\n",
+    "    language=language,\n",
+    "    remove_words=None,\n",
+    "    sample_size=sample_size,\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "593e855d",
+   "metadata": {},
+   "source": [
+    "# Generate and Upload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cda9e874",
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-10-03T12:25:54.735Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "autosuggest_dict = gen_autosuggestions(\n",
+    "    text_corpus,\n",
+    "    language=language,\n",
+    "    num_words=1000,\n",
+    "    ignore_words=None,\n",
+    "    update_local_data=True,\n",
+    "    verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8c385b7",
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-10-03T12:25:55.451Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# autosuggest_dict"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": true,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/requirements.txt b/requirements.txt
index abbd5e443..03ed90a90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ flax>=0.8.2
 iso639-lang>=2.2.3
 m2r2>=0.3.3
 mwparserfromhell>=0.6
+nbsphinx>=0.9.5
 numpydoc>=1.6.0
 packaging>=20.9
 pandas>=1.5.3
diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py
index 223a40fec..21f550e5f 100644
--- a/src/scribe_data/unicode/process_unicode.py
+++ b/src/scribe_data/unicode/process_unicode.py
@@ -2,6 +2,7 @@
 Module for processing Unicode based corpuses for autosuggestion and autocompletion generation.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/unicode/unicode_utils.py b/src/scribe_data/unicode/unicode_utils.py
index 0449fcb94..b03f13c8b 100644
--- a/src/scribe_data/unicode/unicode_utils.py
+++ b/src/scribe_data/unicode/unicode_utils.py
@@ -2,6 +2,7 @@
 Module for a function to get emojis we want to filter from suggestions.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index c7f64e0c6..8c13549c9 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -2,6 +2,7 @@
 Utility functions for data extraction, formatting and loading.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikidata/check_query/check.py b/src/scribe_data/wikidata/check_query/check.py
index dba1a3273..41f1706af 100644
--- a/src/scribe_data/wikidata/check_query/check.py
+++ b/src/scribe_data/wikidata/check_query/check.py
@@ -2,6 +2,7 @@
 Command line tool for testing SPARQl queries against an endpoint.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikidata/check_query/query.py b/src/scribe_data/wikidata/check_query/query.py
index eedb2b195..6e5e6fc87 100644
--- a/src/scribe_data/wikidata/check_query/query.py
+++ b/src/scribe_data/wikidata/check_query/query.py
@@ -2,6 +2,7 @@
 Classes and methods for querying a file in the query check process.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikidata/check_query/sparql.py b/src/scribe_data/wikidata/check_query/sparql.py
index 3ef2ff86e..f702907f8 100644
--- a/src/scribe_data/wikidata/check_query/sparql.py
+++ b/src/scribe_data/wikidata/check_query/sparql.py
@@ -2,6 +2,7 @@
 Functions for running SPARQL queries within the query check process.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index a9dba0b9f..ac1de6d26 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -2,6 +2,7 @@
 Updates data for Scribe by running all or desired WDQS queries and formatting scripts.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py
index 9f02d2a4f..b90e75e24 100644
--- a/src/scribe_data/wikipedia/extract_wiki.py
+++ b/src/scribe_data/wikipedia/extract_wiki.py
@@ -2,6 +2,7 @@
 Module for downloading and creating workable files from Wikipedia dumps.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *
diff --git a/src/scribe_data/wikipedia/gen_autosuggestions.ipynb b/src/scribe_data/wikipedia/gen_autosuggestions.ipynb
index 0e77ed7d8..660ec05c7 100644
--- a/src/scribe_data/wikipedia/gen_autosuggestions.ipynb
+++ b/src/scribe_data/wikipedia/gen_autosuggestions.ipynb
@@ -281,9 +281,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:scribe-data-dev] *",
+   "display_name": "",
    "language": "python",
-   "name": "conda-env-scribe-data-dev-py"
+   "name": ""
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/src/scribe_data/wikipedia/process_wiki.py b/src/scribe_data/wikipedia/process_wiki.py
index 1dfa110ac..bd5bbb162 100644
--- a/src/scribe_data/wikipedia/process_wiki.py
+++ b/src/scribe_data/wikipedia/process_wiki.py
@@ -2,6 +2,7 @@
 Module for cleaning Wikipedia based corpuses for autosuggestion generation.
 
 .. raw:: html
+
     <!--
     * Copyright (C) 2024 Scribe
     *

From 6580405a117fa6a3d2862e0205965d5ebcfc5e59 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Wed, 23 Oct 2024 00:59:19 +0200
Subject: [PATCH 2/2] #235 Tajik adjectives query

---
 .../Tajik/adjectives/query_adjectives.sparql        | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 src/scribe_data/language_data_extraction/Tajik/adjectives/query_adjectives.sparql

diff --git a/src/scribe_data/language_data_extraction/Tajik/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Tajik/adjectives/query_adjectives.sparql
new file mode 100644
index 000000000..fe64a1935
--- /dev/null
+++ b/src/scribe_data/language_data_extraction/Tajik/adjectives/query_adjectives.sparql
@@ -0,0 +1,13 @@
+# tool: scribe-data
+# All Tajik (Q9260) adjectives (Q34698) and the given forms.
+# Enter this query at https://query.wikidata.org/.
+
+SELECT DISTINCT
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?adjective
+
+WHERE {
+  ?lexeme dct:language wd:Q9260 ;
+    wikibase:lexicalCategory wd:Q34698 ;
+    wikibase:lemma ?adjective .
+}