diff --git a/src/llm_datasets/datasets/bg/bgnc_admin_eur.py b/src/llm_datasets/datasets/bg/bgnc_admin_eur.py deleted file mode 100644 index 879ab71..0000000 --- a/src/llm_datasets/datasets/bg/bgnc_admin_eur.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging - -from tqdm.auto import tqdm - -from llm_datasets.datasets.base import MB, Availability, BaseDataset - -logger = logging.getLogger(__name__) - - -# deprecated -> use bulnc instead! -class BGNCAdminEURDataset(BaseDataset): - """Part of Bulgarian National Corpus - - TODO overlap with eurlex_bg? - """ - - DATASET_ID = "bgnc_admin_eur" - TITLE = "ADMIN_EUR Corpus of EU legislation (bg)" - HOMEPAGE = "https://eur-lex.europa.eu/homepage.html" - - AVAILIBILITY = Availability.DIRECT_DOWNLOAD - - LANGUAGES = ["bg"] - HAS_OVERLAP_WITH = [ - "bulnc", - ] - DOWNLOAD_URLS = ["https://dcl.bas.bg/BulNC-registration/dl.php?dl=feeds/ADMIN_EUR.BG.zip"] - - BYTES = 257 * MB - - def download(self): - """DOWNLOAD - ----------- - - Instruction - - - Downloaded locally by clicking on the download link in the browser: - - https://dcl.bas.bg/BulNC-registration/dl.php?dl=feeds/ADMIN_EUR.BG.zip - - - Copy local file to server: - - scp /Local/Path/to/ADMIN_EUR.BG.zip username@clustername:/data/datasets/ele/bg/BgNC/admin_eur - - - Extract files: - - unzip ADMIN_EUR.BG.zip - - """ - pass - - def decompress(self): - # unzip ADMIN_EUR.BG.zip - pass - - def get_texts(self): - files_path = self.get_dataset_file_paths(subdirectories=True, needed_suffix=".txt") - - logger.info(f"Found {len(files_path):,} files") - - for input_file in tqdm(files_path, desc="Reading files"): - # skip if is metadata - if "METADATA" in input_file: - logger.warning(f"Skip {input_file}") - continue - - # each file is one documentt - with open(input_file, "r") as inp: - text = inp.read() - yield text.strip() diff --git a/src/llm_datasets/datasets/bg/bgnc_news_corpus.py b/src/llm_datasets/datasets/bg/bgnc_news_corpus.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/llm_datasets/datasets/bg/bulgarian_news.py b/src/llm_datasets/datasets/bg/bulgarian_news.py deleted file mode 100644 index 8dba995..0000000 --- a/src/llm_datasets/datasets/bg/bulgarian_news.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -import logging -from pathlib import Path - -from llm_datasets.datasets.base import MB, Availability, BaseDataset, License - -logger = logging.getLogger(__name__) - - -class BulgarianNewsDataset(BaseDataset): - DATASET_ID = "bulgarian_news" - TITLE = "Crawl of Bulgarian news websites" - DOWNLOAD_URLS = ["http://old.dcl.bas.bg/dataset/Bulgarian_news.7z"] - DESCRIPTION = ( - "The collection was collected by crawling Bulgarian websites in Bulgarian. Text samples are in json format. We" - " can provide raw tests." - ) - WEB_CRAWLED = True - LANGUAGES = ["bg"] - BYTES = 919 * MB - AVAILIBILITY = Availability.ON_REQUEST - LICENSE = License("research only") - - def decompress(self): - # 7z x Bulgarian_news.7z - pass - - def get_texts(self): - # read from extracted JSON files - for i, file_path in enumerate(Path(self.get_local_dataset_dir()).rglob("*.json")): - if self.skip_items > 0 and i < self.skip_items: - continue - - with open(file_path) as f: - try: - doc = json.load(f) - if "bg_a_text" in doc: - text = self.paragraph_delimiter.join(doc["bg_a_text"]) - yield text - else: - logger.warning("JSON has no text field: %s", file_path) - - except ValueError: - logger.error("Cannot parse JSON from %s", file_path) diff --git a/src/llm_datasets/datasets/bg/bulnc.py b/src/llm_datasets/datasets/bg/bulnc.py deleted file mode 100644 index 6e91f5c..0000000 --- a/src/llm_datasets/datasets/bg/bulnc.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -from pathlib import Path - -from llm_datasets.datasets.base import GB, Availability, BaseDataset, License - -logger = logging.getLogger(__name__) - - -class BulNCDataset(BaseDataset): - DATASET_ID = "bulnc" - TITLE = "Bulgarian National Corpus" - AVAILIBILITY = Availability.ON_REQUEST - DOWNLOAD_URLS = ["http://old.dcl.bas.bg/dataset/BulNC.7z"] # password-protected file! - DESCRIPTION = ( - "The Bulgarian National Corpus contains a wide range of texts in various sizes, media types (written and " - "spoken), styles, periods (synchronic and diachronic), and licenses. Each text in the collection is supplied " - "with metadata. The Bulgarian National Corpus was first compiled using the Bulgarian Lexicographic Archive " - "and the Text Archive of Written Bulgarian, which account for 55.95% of the corpus. Later, the EMEA corpus " - "(medical administrative texts) and the OpenSubtitles corpus (film subtitles) were added, accounting for " - "1.27% and 8.61% of the BulNC, respectively. The remaining texts were crawled automatically and include a " - "large number of administrative texts, news from monolingual and multilingual sources, scientific texts, and " - "popular science. The BulNC is not fully downloadable due to the inclusion of copyrighted material. We've " - "provided a link to a password-protected archive for evaluation." - ) - AVAILIBILITY = Availability.ON_REQUEST - LICENSE = License("research only", sharealike=False) - LANGUAGES = ["bg"] - BYTES = 1.8 * GB - - def decompress(self): - """7z x BulNC.7z - - Folders: 125 - Files: 256906 - Size: 13279357395 - Compressed: 1981942477 - """ - pass - - def get_texts(self): - # read from extracted TXT files - for file_path in Path(self.get_local_dataset_dir()).rglob( - "*.txt" - ): # self.get_dataset_file_paths(subdirectories=True, needed_suffix=".txt"): - with open(file_path) as f: - text = f.read() - - yield text diff --git a/src/llm_datasets/datasets/dataset_registry.py b/src/llm_datasets/datasets/dataset_registry.py index 1a237a0..aec6d43 100644 --- a/src/llm_datasets/datasets/dataset_registry.py +++ b/src/llm_datasets/datasets/dataset_registry.py @@ -76,11 +76,7 @@ ".en.pile_of_law.PileOfLawDataset", ".en.math_amps.MathAMPSDataset", ".en.edgar.EdgarCorpus", - # bg - # ".bg.bgnc_admin_eur.BGNCAdminEURDataset", # deprecated -> use bulnc - # ".bg.bgnc_news_corpus.BGNCNewsCorpusDataset", # deprecated -> use bulnc - ".bg.bulgarian_news.BulgarianNewsDataset", - ".bg.bulnc.BulNCDataset", + # bg: all removed # de ".de.openlegaldata.OpenLegalDataDataset", ".de.dewac.DEWacDataset",