scribe-org · SethiShreya · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/.env b/.env
@@ -0,0 +1,2 @@
+SRCPATH=D:\Outreachy\Scribe-Data\src
+PYTHONPATH=D:\Outreachy\Scribe-Data\venv\Lib\site-packages
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -168,6 +168,31 @@ Note that you may need to run this command every time you make any change to the
 pip install -e .
 ```
 
+## Set Python path for Windows
+```bash
+# Open the .env file and set SRCPATH and PYTHONPATH to match your local setup
+
+SRCPATH=<your-file-path>\Scribe-Data\src
+PYTHONPATH=<your-file-path>\Scribe-Data\<your-virtual-env-name>\Lib\site-packages
+```
+
+Configure the activate.bat (for cmd) and Activate.ps1 (for PowerShell) files located in .\venv\Scripts
+
+- Configure activate.bat file
+```bash
+# Add this line to the activate.bat file, below the line that sets VIRTUAL_ENV
+
+set PYTHONPATH=%SRCPATH%;%PYTHONPATH%
+```
+
+- Configure Activate.ps1 file
+
+```bash
+# Add this line to the Activate.ps1 file, above the deactivate function
+
+$env:PYTHONPATH = "$env:SRCPATH;$env:PYTHONPATH"
+```
+
 > [!NOTE]
 > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup!
 

diff --git a/src/scribe_data/language_data_extraction/English/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/English/emoji_keywords/generate_emoji_keywords.py
@@ -24,11 +24,15 @@
 
 from scribe_data.unicode.process_unicode import gen_emoji_lexicon
 from scribe_data.utils import export_formatted_data
+from scribe_data.utils import (
+    DEFAULT_JSON_EXPORT_DIR,
+)
 
 LANGUAGE = "English"
 DATA_TYPE = "emoji-keywords"
 emojis_per_keyword = 3
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--file-path")
 args = parser.parse_args()
@@ -38,7 +42,7 @@
     emojis_per_keyword=emojis_per_keyword,
 ):
     export_formatted_data(
-        file_path=args.file_path,
+        file_path=DEFAULT_JSON_EXPORT_DIR,
         formatted_data=emoji_keywords_dict,
         query_data_in_use=True,
         language=LANGUAGE,

diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py
@@ -75,7 +75,9 @@ def gen_emoji_lexicon(
     # Pre-set up the emoji popularity data.
     popularity_dict = {}
 
-    with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
+    with (Path(__file__).parent / "2021_ranked.tsv").open(
+        encoding="utf-8"
+    ) as popularity_file:
         tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
         for tsv_row in tsv_reader:
             popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
@@ -106,7 +108,7 @@ def gen_emoji_lexicon(
     }
 
     for cldr_file_key, cldr_file_path in cldr_file_paths.items():
-        with open(cldr_file_path, "r") as file:
+        with open(cldr_file_path, "r", encoding="utf-8") as file:
             cldr_data = json.load(file)
 
         cldr_dict = cldr_data[cldr_file_key]["annotations"]
@@ -184,9 +186,9 @@ def gen_emoji_lexicon(
             noun_data = json.load(f)
 
         plurals_to_singulars_dict = {
-            noun_data[row]["plural"].lower(): row.lower()
-            for row in noun_data
-            if noun_data[row]["plural"] != "isPlural"
+            noun["singular"].lower(): noun["lexemeID"].lower()
+            for noun in noun_data
+            if noun.get("singular")
         }
 
         for plural, singular in plurals_to_singulars_dict.items():