-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsample_load_no_miracl.py
73 lines (60 loc) · 1.93 KB
/
sample_load_no_miracl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
Usage example: python sample_load_no_miracl.py
"""
from nomiracl.dataset import NoMIRACLDataLoader
from nomiracl import LoggingHandler
import logging, os
#### Just some code to print debug information to stdout
# Log records at INFO level and above are routed through nomiracl's LoggingHandler.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

# Sampling configuration shared by both loading techniques below.
language = "en"  # Language of the dataset (two-letter code; a key of language_code_map)
split = "test"  # Split of the dataset: dev, test
relevant_ratio = 0.5  # Ratio of relevant samples
non_relevant_ratio = 0.5  # Ratio of non-relevant samples
max_sample_pool = 250  # Maximum cap of samples to load for each subset

# Technique 1: Download the NoMIRACL dataset directly from the web and unzip
# The dataset root can be overridden with the NOMIRACL_DATA_DIR environment
# variable so the sample runs on machines other than the author's; when the
# variable is unset, the original hard-coded location is used.
data_dir = os.environ.get(
    "NOMIRACL_DATA_DIR",
    "/mnt/users/n3thakur/research_new/miracl-unanswerable/no-miracl",
)  # Directory to save the dataset
data_dir = os.path.join(data_dir, language)  # append the language code as a sub-directory

data_loader = NoMIRACLDataLoader(data_dir=data_dir, split=split)
corpus, queries, qrels = data_loader.load_data_sample(
    relevant_ratio=relevant_ratio,
    non_relevant_ratio=non_relevant_ratio,
    max_sample_pool=max_sample_pool,
)

# Technique 2: Load the NoMIRACL dataset from Huggingface
# NoMIRACL (HuggingFace): https://huggingface.co/datasets/miracl/nomiracl
# Maps the two-letter code stored in `language` to the full language name
# that is passed to the loader in this mode.
language_code_map = {
    "ar": "arabic",
    "bn": "bengali",
    "de": "german",
    "en": "english",
    "es": "spanish",
    "fa": "persian",
    "fi": "finnish",
    "fr": "french",
    "hi": "hindi",
    "id": "indonesian",
    "ja": "japanese",
    "ko": "korean",
    "ru": "russian",
    "sw": "swahili",
    "te": "telugu",
    "th": "thai",
    "yo": "yoruba",
    "zh": "chinese",
}

# NOTE: this rebinds data_loader, corpus, queries and qrels, so the results of
# Technique 1 are replaced by the ones loaded here.
data_loader = NoMIRACLDataLoader(
    language=language_code_map[language],  # raises KeyError for a code missing from the map
    split=split,
    hf_dataset_name="miracl/nomiracl",
    load_from_huggingface=True,
)
corpus, queries, qrels = data_loader.load_data_sample(
    relevant_ratio=relevant_ratio,
    non_relevant_ratio=non_relevant_ratio,
    max_sample_pool=max_sample_pool,
)