Skip to content

Commit

Permalink
Merge pull request #60 from shashank-iitbhu/langcodes-implementation
Browse files Browse the repository at this point in the history
Fixes #55: Refactor ISO code usage using Python langcodes
  • Loading branch information
andrewtavis authored Jan 27, 2024
2 parents fac762e + 2a5b295 commit c660feb
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 14 deletions.
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ dependencies:
- regex>=2023.8.8
- SPARQLWrapper>=2.0.0
- tensorflow>=2.11.0
- langcodes>=3.0.0
- language_data>=1.0.0
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ regex>=2023.3.23
sentencepiece>=0.1.95
SPARQLWrapper>=2.0.0
tabulate>=0.8.9
tensorflow>=2.5.1
tensorflow>=2.11.0
tqdm==4.56.1
transformers>=4.12
langcodes>=3.0.0
language_data>=1.0.0
25 changes: 12 additions & 13 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
from importlib import resources
from pathlib import Path
from typing import Any
import langcodes
from langcodes import *

PROJECT_ROOT = "Scribe-Data"

Expand Down Expand Up @@ -154,13 +156,11 @@ def get_language_iso(language: str) -> str:
str
The ISO code for the language.
"""
return _find(
"language",
language,
"iso",
f"{language.capitalize()} is currently not a supported language for ISO conversion.",
)

try:
iso_code = str(langcodes.find(language).language)
except LookupError:
raise ValueError(f"{language.capitalize()} is currently not a supported language for ISO conversion.")
return iso_code

def get_language_from_iso(iso: str) -> str:
"""
Expand All @@ -176,12 +176,11 @@ def get_language_from_iso(iso: str) -> str:
str
The name for the language which has an ISO value of iso.
"""
return _find(
"iso",
iso,
"language",
f"{iso.upper()} is currently not a supported ISO language.",
).capitalize()

language_name = str(Language.make(language=iso).display_name())
if "Unknown language" in str(language_name):
raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")
return language_name


def get_language_words_to_remove(language: str) -> list[str]:
Expand Down
3 changes: 3 additions & 0 deletions tests/load/test_update_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import unittest
import pytest

import sys
sys.path.append('../../src')

from scribe_data import utils


Expand Down

0 comments on commit c660feb

Please sign in to comment.