Skip to content

Commit

Permalink
Added generation script with seed to ensure the same questions are ge…
Browse files Browse the repository at this point in the history
…nerated every time.

Generated a new set of evals based on the new generation script
  • Loading branch information
sakher-sawan committed May 17, 2024
1 parent 1b3f4a9 commit 6c76796
Show file tree
Hide file tree
Showing 9 changed files with 7,435 additions and 8 deletions.
1 change: 1 addition & 0 deletions evals/registry/data/quran_eval/gen_script/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
generated
220 changes: 220 additions & 0 deletions evals/registry/data/quran_eval/gen_script/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import pandas as pd
import json
import random
import re
import os

SEED = 42

def load_quran_data(file_path):
    """Read the pipe-delimited verse file into a DataFrame.

    The file is read without a header row; its three fields are labelled
    ``chapter``, ``verse`` and ``text`` in that order.
    """
    column_names = ['chapter', 'verse', 'text']
    quran_frame = pd.read_csv(file_path, sep='|', header=None, names=column_names)
    return quran_frame

def load_chapter_names(file_path):
    """Parse the chapter-metadata JSON file at ``file_path`` into a DataFrame."""
    chapters_frame = pd.read_json(file_path)
    return chapters_frame

def extract_random_ayas(df, number_of_ayas):
    """Return ``number_of_ayas`` rows sampled from ``df`` using the fixed ``SEED``.

    The global ``random`` module is also re-seeded with ``SEED`` as a side
    effect; the sample itself is driven by ``random_state=SEED``.
    """
    random.seed(SEED)
    sampled_rows = df.sample(random_state=SEED, n=number_of_ayas)
    return sampled_rows

def load_distractors(file_path):
    """Return the parsed content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def generate_mcq_questions(aya, distractors_list, n=3, rng=None):
    """Build one multiple-choice question around a single verse.

    The verse text is mixed with ``n`` distractors, the options are shuffled
    and labelled ``A``, ``B``, ... and the same option list is embedded in an
    English and an Arabic prompt.

    Args:
        aya: Mapping-like row; only ``aya['text']`` (the correct option) is read.
        distractors_list: Candidate wrong options. Entries equal to the verse
            text are ignored.
        n: Number of distractors to include (0 to 25, so labels stay in A-Z).
        rng: Optional ``random.Random`` used for sampling and shuffling. When
            omitted, a private generator seeded from ``SEED`` and the verse
            text is used, so output is reproducible across runs but differs
            from verse to verse.

    Returns:
        Tuple ``(question_content_en, question_content_ar, correct_label)``.

    Raises:
        ValueError: If ``n`` is out of range or there are fewer than ``n``
            usable distractors.
    """
    if not 0 <= n <= 25:
        raise ValueError(f"n must be between 0 and 25, got {n}")

    correct_answer = aya['text']
    valid_distractors = [d for d in distractors_list if d != correct_answer]
    if len(valid_distractors) < n:
        raise ValueError(
            f"need {n} distractors but only {len(valid_distractors)} are available"
        )

    if rng is None:
        # Re-seeding the *global* generator with the same SEED on every call
        # made sample() and shuffle() return the same permutation each time,
        # which put the correct answer on the same letter in every question.
        # A per-verse private generator stays deterministic without that bias
        # and leaves the global random state untouched.
        rng = random.Random(f"{SEED}:{correct_answer}")

    options = rng.sample(valid_distractors, n) + [correct_answer]
    rng.shuffle(options)

    # One label per option, so no option is silently dropped when n != 3.
    option_labels = [chr(ord('A') + i) for i in range(len(options))]
    options_text = "; ".join(
        f"{label}. {option}" for label, option in zip(option_labels, options)
    )
    labels_text = ", ".join(option_labels)  # "A, B, C, D" for the default n=3

    question_content_en = f"Which of the following is a text from the Quran? {options_text}, please answer with the letter of the correct option ({labels_text}) only"
    question_content_ar = f"أي من التالي هو نص من القرآن؟ {options_text}, يرجى الإجابة بحرف الخيار الصحيح ({labels_text}) فقط"

    correct_label = option_labels[options.index(correct_answer)]

    return question_content_en, question_content_ar, correct_label

def redact_aya(text, all_texts, rng=None):
    """Blank out a random run of consecutive words in ``text``.

    Up to 100 random spans are tried. A span is accepted only when exactly one
    entry of ``all_texts`` matches "<words before the blank> ... <words after
    the blank>". The match is anchored at the start of the entry only, so the
    trailing words may appear anywhere after the leading ones.

    Args:
        text: The verse to redact; it is split on whitespace.
        all_texts: Every verse text, used for the uniqueness check.
        rng: Optional ``random.Random`` used to pick the span. When omitted, a
            private generator seeded from ``SEED`` and ``text`` is used, so the
            result is reproducible across runs but varies between verses.

    Returns:
        ``(redacted_aya, first_section, missing_section, third_section)``, or
        ``None`` when the verse has at most one word or no acceptable span was
        found within 100 attempts.
    """
    words = text.split()
    if len(words) <= 1:
        return None

    if rng is None:
        # Re-seeding the global generator with the same SEED on every call gave
        # every verse the same draw sequence (so the blank always started at
        # the same relative position) and clobbered the global random state.
        rng = random.Random(f"{SEED}:{text}")

    for _ in range(100):
        start = rng.randint(0, len(words) - 1)
        end = rng.randint(start + 1, len(words))
        first_section = ' '.join(words[:start])
        missing_section = ' '.join(words[start:end])
        third_section = ' '.join(words[end:])

        if not first_section and not third_section:
            # The whole verse was blanked: no context is left for the reader
            # and the pattern below would match every text.
            continue

        # Compile once per attempt instead of going through re.match for
        # every candidate text.
        pattern = re.compile(re.escape(first_section) + '.*' + re.escape(third_section))
        match_count = sum(1 for candidate in all_texts if pattern.match(candidate))
        if match_count == 1:
            redacted_aya = f"{first_section} ________ {third_section}".strip()
            return redacted_aya, first_section, missing_section, third_section

    return None

def generate_bilingual_questions(ayas_df, question_type, distractors=None):
    """Turn verse rows into chat-style eval samples, half English and half Arabic.

    The first ``len(ayas_df) // 2`` rows (by position) get the English prompt
    and the remaining rows get the Arabic prompt. For ``"missing_text"``
    questions, a sentence naming the surah, chapter and verse is added to
    every other question, starting with the first.

    Args:
        ayas_df: DataFrame of verses. The columns read depend on the type:
            ``missing_text`` reads ``redacted``, ``missing_section``, ``name``,
            ``chapter``, ``verse``; ``surah_name`` reads ``text``, ``name``,
            ``transliteration``, ``translation``; ``surah_type`` reads ``text``
            and ``type``; ``mcq`` passes the whole row to
            ``generate_mcq_questions``.
        question_type: One of ``"missing_text"``, ``"surah_name"``,
            ``"surah_type"`` or ``"mcq"``.
        distractors: Wrong options for ``"mcq"`` questions. When omitted, the
            module-level ``distractors_list`` set by the script entry point is
            used.

    Returns:
        List of dicts with an ``"input"`` list (system + user message) and an
        ``"ideal"`` list of accepted answers.

    Raises:
        ValueError: If ``question_type`` is not one of the supported values.
    """
    supported_types = ("missing_text", "surah_name", "surah_type", "mcq")
    if question_type not in supported_types:
        # Previously this surfaced as an UnboundLocalError inside the loop.
        raise ValueError(
            f"unknown question_type {question_type!r}; expected one of {supported_types}"
        )

    is_mcq = question_type == "mcq"
    if is_mcq and distractors is None:
        # Backward compatibility: the script publishes the list as a global.
        distractors = distractors_list

    # No global re-seeding here: this function draws no random numbers itself,
    # and generate_mcq_questions takes care of its own seeding.
    if is_mcq:
        user_prompt_en = "Please provide the answer by selecting the correct letter (A, B, C, or D) without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة عن طريق تحديد الحرف الصحيح (A, B, C, أو D) دون أي تعليق إضافي"
    else:
        user_prompt_en = "Please provide the answer, and ONLY the answer without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة. وفقط الإجابة دون أي تعليق إضافي"

    bilingual_questions = []
    half_length = len(ayas_df) // 2
    include_extra_info = True

    # Split by row *position*, not by index label: a filtered frame keeps its
    # original (gappy) labels, which made the old `index < half_length` test
    # send far more than half of the rows to the Arabic branch.
    for position, (_, row) in enumerate(ayas_df.iterrows()):
        if question_type == "missing_text":
            extra_info_en = f" This text is from Surah {row['name']} (Chapter {row['chapter']}, Verse {row['verse']})." if include_extra_info else ""
            extra_info_ar = f" هذا النص القرآني من سورة {row['name']} (السورة {row['chapter']}، الآية {row['verse']})." if include_extra_info else ""
            # NOTE(review): the literal "." after the extra-info placeholder
            # yields a doubled period; left unchanged to keep prompts stable.
            question_content_en = f"Fill in the blank of the following Quranic text: (({row['redacted']})) to complete the full verse.{extra_info_en}. The answer may be one or more words."
            question_content_ar = f"املأ الفراغ في النص القرآني التالي: (({row['redacted']})) لإتمام الآية كاملة.{extra_info_ar}. قد تكون الإجابة عبارة عن كلمة واحدة أو أكثر."
            ideal_answer = [row['missing_section']]

        elif question_type == "surah_name":
            question_content_en = f"Identify the Surah (in Arabic) of the following Quranic text: {row['text']} (Please provide the answer without diacritics but keep hamza and madda)."
            question_content_ar = f"حدد اسم السورة للنص القرآني التالي: {row['text']} (يرجى تقديم الإجابة بدون تشكيل ولكن احتفظ بالهمزة والمد)."
            ideal_answer = [row['name'], row['transliteration'], row['translation']]

        elif question_type == "surah_type":
            question_content_en = f"Determine if the Surah of the following Quranic aya text is meccan or madinan: {row['text']} answer only with either 'meccan' or 'madinan' (exactly in small case)."
            question_content_ar = f"حدد إذا كانت السورة للنص القرآني التالي مكية أو مدنية: {row['text']} أجب فقط بـ 'مكية' أو 'مدنية' (بدون تشكيل)."
            answer_arabic_translations = ['مكية', 'مكي', 'مكة'] if row['type'] == 'meccan' else ['مدنية', 'مدني', 'المدينة']
            # NOTE(review): the prompt asks for 'madinan' but the accepted
            # English answer is the raw row['type'] value; confirm the chapter
            # metadata uses that exact spelling.
            ideal_answer = [row['type']] + answer_arabic_translations

        else:  # "mcq"
            question_content_en, question_content_ar, correct_label = generate_mcq_questions(row, distractors)
            ideal_answer = [correct_label]

        # The accepted answers are the same for both languages; only the
        # prompt text differs.
        if position < half_length:  # English questions
            system_content, user_content = question_content_en, user_prompt_en
        else:  # Arabic questions
            system_content, user_content = question_content_ar, user_prompt_ar

        bilingual_questions.append({
            "input": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ],
            "ideal": ideal_answer,
        })

        # Toggle extra info for next question
        include_extra_info = not include_extra_info

    return bilingual_questions

if __name__ == '__main__':
    # Script entry point: sample verses, build the four question sets, then
    # write each set twice - as indented JSON under gen_script/generated and
    # as JSON Lines under the quran_eval data folder.
    # All paths are relative to the current working directory.
    resources_folder = 'evals/registry/data/quran_eval/gen_script/resources'
    quran_file_path = f'{resources_folder}/Arabic-Original.csv'
    chapters_file_path = f'{resources_folder}/chapters-en.json'
    distractors_file_path = f'{resources_folder}/distractors_not_quranic.json'

    random.seed(SEED)

    # Load and prepare data
    quran_df = load_quran_data(quran_file_path)
    chapters_df = load_chapter_names(chapters_file_path)
    random_ayas_df = extract_random_ayas(quran_df, 350)
    # Must stay a module-level name: generate_bilingual_questions reads
    # `distractors_list` as a global when building MCQ questions.
    distractors_list = load_distractors(distractors_file_path)

    # Attach the chapter metadata to every sampled verse.
    random_ayas_df = random_ayas_df.merge(chapters_df, left_on='chapter', right_on='id')
    random_ayas_df.drop(columns=['id', 'total_verses'], inplace=True)

    # Apply the redaction process and validation
    all_texts = quran_df['text'].tolist()
    validated_ayas = []

    for _, row in random_ayas_df.iterrows():
        result = redact_aya(row['text'], all_texts)
        if not result:
            continue
        row['redacted'], row['first_section'], row['missing_section'], row['third_section'] = result
        # Keep the verse only if no other verse starts with the same text.
        # A plain prefix test replaces re.match(row['text'], ...), which
        # treated the verse as an unescaped regular expression.
        verse_text = row['text']
        if sum(1 for candidate in all_texts if candidate.startswith(verse_text)) == 1:
            validated_ayas.append(row)

    validated_ayas_df = pd.DataFrame(validated_ayas)

    # Generate bilingual questions
    bilingual_missing_text_questions = generate_bilingual_questions(validated_ayas_df, "missing_text")
    bilingual_surah_name_questions = generate_bilingual_questions(validated_ayas_df, "surah_name")
    bilingual_surah_type_questions = generate_bilingual_questions(validated_ayas_df, "surah_type")
    # MCQ questions use every sampled verse, not only the validated ones.
    mcq_questions = generate_bilingual_questions(random_ayas_df, "mcq")

    # Output file stem -> questions; the same stem is used for both formats.
    question_sets = {
        'masked_quranic_text': bilingual_missing_text_questions,
        'guess_quran_surah_name': bilingual_surah_name_questions,
        'guess_quran_surah_type': bilingual_surah_type_questions,
        'guess_which_text_is_from_quran': mcq_questions,
    }

    # Human-readable copies: one indented JSON array per question type.
    generated_folder = 'evals/registry/data/quran_eval/gen_script/generated'
    os.makedirs(generated_folder, exist_ok=True)
    for stem, questions in question_sets.items():
        with open(f'{generated_folder}/{stem}.json', 'w', encoding='utf-8') as file:
            json.dump(questions, file, ensure_ascii=False, indent=4)

    # Final eval data: JSON Lines, one question per line.
    output_folder = 'evals/registry/data/quran_eval'
    os.makedirs(output_folder, exist_ok=True)
    for stem, questions in question_sets.items():
        with open(f'{output_folder}/{stem}.jsonl', 'w', encoding='utf-8') as file:
            file.writelines(
                json.dumps(question, ensure_ascii=False) + '\n' for question in questions
            )
Loading

0 comments on commit 6c76796

Please sign in to comment.