diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..a32742a
Binary files /dev/null and b/.DS_Store differ
diff --git a/misc/.DS_Store b/misc/.DS_Store
new file mode 100644
index 0000000..a7f5ee5
Binary files /dev/null and b/misc/.DS_Store differ
diff --git a/misc/NEET/.DS_Store b/misc/NEET/.DS_Store
new file mode 100644
index 0000000..f894b1c
Binary files /dev/null and b/misc/NEET/.DS_Store differ
diff --git a/misc/NEET/Generator_Template.ipynb b/misc/NEET/Generator_Template.ipynb
new file mode 100644
index 0000000..f0182c7
--- /dev/null
+++ b/misc/NEET/Generator_Template.ipynb
@@ -0,0 +1,1097 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/Users/roshansk/Documents/GitHub/Global_Exams'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import base64\n",
+ "from openai import OpenAI\n",
+ "from anthropic import Anthropic\n",
+ "from pdf2image import convert_from_path\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "320b7be4-6a54-4169-8666-0a2c56359109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def encode_image(image_path):\n",
+ " \"\"\"\n",
+ " Function to encode the image\n",
+ " \"\"\"\n",
+ " with open(image_path, \"rb\") as image_file:\n",
+ " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+ "\n",
+ "\n",
+ "def get_message_list(base64_string, prompt):\n",
+ "\n",
+ " message_list = [\n",
+ " {\n",
+ " \"role\": 'user',\n",
+ " \"content\": [\n",
+ " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n",
+ " {\"type\": \"text\", \"text\": prompt}\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ " return message_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from anthropic import Anthropic\n",
+ "\n",
+ "client = Anthropic()\n",
+ "\n",
+ "# MODEL_NAME = \"claude-3-opus-20240229\"\n",
+ "\n",
+ "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Tamil_G4.pdf'\n",
+ "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G4_Answer_Key.json'\n",
+ "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Tamil_G4'\n",
+ "\n",
+ "\n",
+ "lang_code = 'ta'\n",
+ "country = 'India'\n",
+ "file_name = 'Paper_20201106090359.pdf'\n",
+ "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106090359.pdf'\n",
+ "license = 'open'\n",
+ "level = 'University'\n",
+ "\n",
+ "output_file_name = 'NEET_2020_Tamil_processed'\n",
+ "\n",
+ "pages_to_include=[2,23]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18",
+ "metadata": {},
+ "source": [
+ "## Prompt (user input required: change the prompt value to match the language of the question paper)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "# \"\"\"\n",
+ "\n",
+ "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Tamil.\n",
+ "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "The json value should have the following keys : \n",
+ "- number : The number of the question\n",
+ "- question : The actual text of the question\n",
+ "- options : A list containing all 4 options for the question\n",
+ "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7",
+ "metadata": {},
+ "source": [
+ "## Creating Image Files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(image_folder):\n",
+ " os.makedirs(image_folder)\n",
+ "\n",
+ "pages = convert_from_path(pdf_file)\n",
+ "\n",
+ "start_page, end_page = pages_to_include \n",
+ "count = 0\n",
+ "for i in range(start_page-1, end_page):\n",
+ " filename = \"page_\"+str(i)+'.jpg'\n",
+ " pages[i].save(Path(image_folder) / filename)\n",
+ " count += 1\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "22 images created\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"{count} images created\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcbe3632-fee8-473c-a206-c417f638484b",
+ "metadata": {},
+ "source": [
+ "## Extracting questions from the page images with Claude"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "1b4d55cc-c997-4614-8405-9b48154f8c80",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 5%|██ | 1/22 [00:31<11:00, 31.45s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_2.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 9%|████ | 2/22 [00:59<09:46, 29.33s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_3.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 14%|██████ | 3/22 [01:21<08:16, 26.14s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_1.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 18%|████████ | 4/22 [01:48<07:55, 26.43s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_4.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 23%|██████████ | 5/22 [02:14<07:25, 26.22s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_5.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 27%|███████████▋ | 6/22 [06:41<28:49, 108.09s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_7.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 32%|██████████████ | 7/22 [06:59<19:40, 78.68s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_6.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 36%|████████████████ | 8/22 [07:26<14:30, 62.17s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_19.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 41%|██████████████████ | 9/22 [07:53<11:06, 51.27s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_18.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 45%|███████████████████▌ | 10/22 [08:04<07:44, 38.67s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_22.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 50%|█████████████████████▌ | 11/22 [08:29<06:19, 34.51s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_20.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 55%|██████████████████████▉ | 12/22 [14:32<22:25, 134.51s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_21.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 59%|████████████████████████▊ | 13/22 [15:04<15:30, 103.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_10.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 64%|███████████████████████████▎ | 14/22 [15:27<10:33, 79.20s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_11.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 68%|█████████████████████████████▎ | 15/22 [15:52<07:20, 62.90s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_13.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 73%|███████████████████████████████▎ | 16/22 [16:18<05:11, 51.86s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_12.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 77%|█████████████████████████████████▏ | 17/22 [16:44<03:39, 43.84s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_16.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 82%|██████████████████████████████████▎ | 18/22 [23:51<10:36, 159.11s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_17.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 86%|████████████████████████████████████▎ | 19/22 [24:18<05:58, 119.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_15.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 91%|███████████████████████████████████████ | 20/22 [24:46<03:03, 91.92s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_14.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 95%|█████████████████████████████████████████ | 21/22 [25:08<01:10, 70.92s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_8.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████| 22/22 [25:31<00:00, 69.61s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_9.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "imageList = os.listdir(image_folder)\n",
+ "\n",
+ "json_output = {}\n",
+ "text_output = {}\n",
+ "\n",
+ "for i in tqdm(range(len(imageList))):\n",
+ " image_path = Path(image_folder) / imageList[i]\n",
+ " base64_string = encode_image(image_path)\n",
+ "\n",
+ " message_list = get_message_list(base64_string, prompt) \n",
+ "\n",
+ " response = client.messages.create(\n",
+ " model=MODEL_NAME,\n",
+ " max_tokens=4096,\n",
+ " messages=message_list,\n",
+ " temperature = 0.0,\n",
+ " top_p = 1\n",
+ " \n",
+ " )\n",
+ "\n",
+ " try:\n",
+ " out = eval(response.content[0].text)\n",
+ "\n",
+ " json_output[i] = out \n",
+ " print(f\"{imageList[i]} added to json\")\n",
+ " except:\n",
+ " text_output[i] = response.content[0].text\n",
+ " print(f\"{imageList[i]} added to text\")\n",
+ " \n",
+ "json_backup = json_output.copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "a308436b-d478-48e9-b476-a8c50878f912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total no of samples : 180\n",
+ "Total no of samples without images: 160\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " image | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " சரியற்ற கூற்றை கண்டறிக. | \n",
+ " [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11 | \n",
+ " உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... | \n",
+ " [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 12 | \n",
+ " அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... | \n",
+ " [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13 | \n",
+ " வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? | \n",
+ " [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 14 | \n",
+ " S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... | \n",
+ " [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original_question_num question \\\n",
+ "0 10 சரியற்ற கூற்றை கண்டறிக. \n",
+ "1 11 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n",
+ "2 12 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n",
+ "3 13 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n",
+ "4 14 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n",
+ "\n",
+ " options image \n",
+ "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... False \n",
+ "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... False \n",
+ "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... False \n",
+ "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... False \n",
+ "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... False "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output = []\n",
+ "for key in json_output.keys():\n",
+ " output += json_output[key]\n",
+ "\n",
+ "\n",
+ "df = pd.DataFrame(output)\n",
+ "df.columns = ['original_question_num','question','options','image']\n",
+ "\n",
+ "print(f\"Total no of samples : {len(df)}\")\n",
+ "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea",
+ "metadata": {},
+ "source": [
+ "#### Removing samples with images/tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "2c118ef0-1eda-453a-ab8a-2db286928528",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df[df.image == False]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5066c956-a46b-44a7-973f-ba3589a5a028",
+ "metadata": {},
+ "source": [
+ "#### Attaching Answer Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(answer_file) as f:\n",
+ " answer_key = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n",
+ "\n",
+ "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32a86338-be74-4a89-bb70-2821910f43ee",
+ "metadata": {},
+ "source": [
+ "#### Assigning categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "category_map = {'biology': 'உயிரியல்' ,'chemistry':'வேதியியல்','physics':'இயற்பியல்'}\n",
+ "\n",
+ "question_category_map = {}\n",
+ "for i in range(1,91):\n",
+ " question_category_map[i] = 'biology'\n",
+ "\n",
+ "for i in range(91,136):\n",
+ " question_category_map[i] = 'physics'\n",
+ "\n",
+ "for i in range(136,181):\n",
+ " question_category_map[i] = 'chemistry'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "bda39801-e316-48ee-9c68-c7178935cd59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n",
+ "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd7aaff4-1f19-47fd-a52b-105870444262",
+ "metadata": {},
+ "source": [
+ "#### Assigning other metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['language'] = lang_code\n",
+ "df['country'] = country\n",
+ "df['file_name'] = file_name\n",
+ "df['source'] = source\n",
+ "df['license'] = license\n",
+ "df['level'] = level\n",
+ "\n",
+ "\n",
+ "df_ = df.copy()\n",
+ "df.drop('image', axis = 1, inplace = True)\n",
+ "\n",
+ "cols = ['language',\n",
+ " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n",
+ " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n",
+ "\n",
+ "df = df[cols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " language | \n",
+ " country | \n",
+ " file_name | \n",
+ " source | \n",
+ " license | \n",
+ " level | \n",
+ " category_en | \n",
+ " category_original_lang | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " answer | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 10 | \n",
+ " சரியற்ற கூற்றை கண்டறிக. | \n",
+ " [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 11 | \n",
+ " உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... | \n",
+ " [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 12 | \n",
+ " அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... | \n",
+ " [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 13 | \n",
+ " வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? | \n",
+ " [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 14 | \n",
+ " S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... | \n",
+ " [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " language country file_name \\\n",
+ "0 ta India Paper_20201106090359.pdf \n",
+ "1 ta India Paper_20201106090359.pdf \n",
+ "2 ta India Paper_20201106090359.pdf \n",
+ "3 ta India Paper_20201106090359.pdf \n",
+ "4 ta India Paper_20201106090359.pdf \n",
+ "\n",
+ " source license level \\\n",
+ "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "\n",
+ " category_en category_original_lang original_question_num \\\n",
+ "0 biology உயிரியல் 10 \n",
+ "1 biology உயிரியல் 11 \n",
+ "2 biology உயிரியல் 12 \n",
+ "3 biology உயிரியல் 13 \n",
+ "4 biology உயிரியல் 14 \n",
+ "\n",
+ " question \\\n",
+ "0 சரியற்ற கூற்றை கண்டறிக. \n",
+ "1 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n",
+ "2 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n",
+ "3 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n",
+ "4 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n",
+ "\n",
+ " options answer \n",
+ "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... 2 \n",
+ "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... 1 \n",
+ "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... 1 \n",
+ "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... 4 \n",
+ "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... 4 "
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afed88ff-1bb6-4157-934e-e72a40538a1a",
+ "metadata": {},
+ "source": [
+ "#### Save processed file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_json(output_file_name+'.json', orient='records')\n",
+ "df_.to_csv(output_file_name+'.tsv',sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "160"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('/')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "470"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "160+161+149"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a98c8472-e0a2-487f-bab5-89eb7c80af0a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/misc/NEET/Readme.md b/misc/NEET/Readme.md
new file mode 100644
index 0000000..4c31a51
--- /dev/null
+++ b/misc/NEET/Readme.md
@@ -0,0 +1,13 @@
+Contains code for processing NEET (National Eligibility cum Entrance Test) question papers.
+
+Source : https://www.nta.ac.in/
+
+**Files**
+- Generator_Template.ipynb : Basic template for the pipeline that processes a NEET question paper
+- generator_codes : Contains examples of previous uses of the Generator_Template notebook for specific question papers
+
+
+
+**For generating answer key JSON files**
+Answer key maps can be extracted with Claude by passing either the PDF or an image of the answer key along with the following prompt:
+> The following is an answer key for an exam. The PDF consists of question numbers, each followed by the correct option (a number between 1 and 4). These are spread across 4 columns. Process the answer key and output it as a JSON file with the question number as the key and the correct answer as the value.
\ No newline at end of file
diff --git a/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb b/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb
new file mode 100644
index 0000000..aa320bc
--- /dev/null
+++ b/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb
@@ -0,0 +1,1029 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/Users/roshansk/Documents/GitHub/Global_Exams'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import base64\n",
+ "from openai import OpenAI\n",
+ "from anthropic import Anthropic\n",
+ "from pdf2image import convert_from_path\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "320b7be4-6a54-4169-8666-0a2c56359109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def encode_image(image_path):\n",
+ " \"\"\"\n",
+ " Function to encode the image\n",
+ " \"\"\"\n",
+ " with open(image_path, \"rb\") as image_file:\n",
+ " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+ "\n",
+ "\n",
+ "def get_message_list(base64_string, prompt):\n",
+ "\n",
+ " message_list = [\n",
+ " {\n",
+ " \"role\": 'user',\n",
+ " \"content\": [\n",
+ " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n",
+ " {\"type\": \"text\", \"text\": prompt}\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ " return message_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from anthropic import Anthropic\n",
+ "\n",
+ "client = Anthropic()\n",
+ "\n",
+ "# MODEL_NAME = \"claude-3-opus-20240229\"\n",
+ "\n",
+ "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "language = 'hindi'\n",
+ "\n",
+ "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Gujarati_F6.pdf'\n",
+ "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_F6_Answer_Key.json'\n",
+ "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Gujarati_F6'\n",
+ "\n",
+ "\n",
+ "lang_code = 'gu'\n",
+ "country = 'India'\n",
+ "file_name = 'Paper_20201106083723.pdf'\n",
+ "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106083723.pdf'\n",
+ "license = 'open'\n",
+ "level = 'University'\n",
+ "\n",
+ "output_file_name = 'NEET_2020_Gujarati_processed'\n",
+ "\n",
+ "pages_to_include=[26,45]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "# \"\"\"\n",
+ "\n",
+ "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Gujarati.\n",
+ "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "The json value should have the following keys : \n",
+ "- number : The number of the question\n",
+ "- question : The actual text of the question\n",
+ "- options : A list containing all 4 options for the question\n",
+ "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7",
+ "metadata": {},
+ "source": [
+ "## Creating Image Files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(image_folder):\n",
+ " os.makedirs(image_folder)\n",
+ "\n",
+ "pages = convert_from_path(pdf_file)\n",
+ "\n",
+ "start_page, end_page = pages_to_include \n",
+ "count = 0\n",
+ "for i in range(start_page-1, end_page):\n",
+ " filename = \"page_\"+str(i)+'.jpg'\n",
+ " pages[i].save(Path(image_folder) / filename)\n",
+ " count += 1\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"{count} images created\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcbe3632-fee8-473c-a206-c417f638484b",
+ "metadata": {},
+ "source": [
+ "## Extracting questions from Claude"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1b4d55cc-c997-4614-8405-9b48154f8c80",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 5%|██▏ | 1/20 [00:29<09:23, 29.63s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_44.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 10%|████▍ | 2/20 [00:55<08:15, 27.52s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_40.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 15%|██████▌ | 3/20 [01:16<06:53, 24.32s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_41.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 20%|████████▊ | 4/20 [01:41<06:36, 24.77s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_43.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 25%|███████████ | 5/20 [02:12<06:45, 27.06s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_42.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 30%|█████████████▏ | 6/20 [02:43<06:37, 28.39s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_31.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 35%|███████████████▍ | 7/20 [03:01<05:23, 24.88s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_25.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 40%|█████████████████▌ | 8/20 [03:31<05:18, 26.56s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_30.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 45%|███████████████████▊ | 9/20 [03:51<04:28, 24.39s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_26.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 50%|█████████████████████▌ | 10/20 [04:22<04:25, 26.50s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_32.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 55%|███████████████████████▋ | 11/20 [04:45<03:48, 25.36s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_33.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 60%|█████████████████████████▊ | 12/20 [05:18<03:43, 27.93s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_27.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 65%|███████████████████████████▉ | 13/20 [05:39<03:00, 25.75s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_37.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 70%|██████████████████████████████ | 14/20 [06:13<02:49, 28.32s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_36.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 75%|████████████████████████████████▎ | 15/20 [06:41<02:20, 28.01s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_34.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 80%|██████████████████████████████████▍ | 16/20 [07:13<01:57, 29.31s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_35.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 85%|████████████████████████████████████▌ | 17/20 [07:42<01:27, 29.25s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_38.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 90%|██████████████████████████████████████▋ | 18/20 [08:10<00:57, 28.73s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_39.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 95%|████████████████████████████████████████▊ | 19/20 [08:40<00:29, 29.05s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_29.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████| 20/20 [08:57<00:00, 26.90s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_28.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "imageList = os.listdir(image_folder)\n",
+ "\n",
+ "json_output = {}\n",
+ "text_output = {}\n",
+ "\n",
+ "for i in tqdm(range(len(imageList))):\n",
+ " image_path = Path(image_folder) / imageList[i]\n",
+ " base64_string = encode_image(image_path)\n",
+ "\n",
+ " message_list = get_message_list(base64_string, prompt) \n",
+ "\n",
+ " response = client.messages.create(\n",
+ " model=MODEL_NAME,\n",
+ " max_tokens=4096,\n",
+ " messages=message_list,\n",
+ " temperature = 0.0,\n",
+ " top_p = 1\n",
+ " \n",
+ " )\n",
+ "\n",
+ " try:\n",
+ " out = eval(response.content[0].text)\n",
+ "\n",
+ " json_output[i] = out \n",
+ " print(f\"{imageList[i]} added to json\")\n",
+ " except:\n",
+ " text_output[i] = response.content[0].text\n",
+ " print(f\"{imageList[i]} added to text\")\n",
+ " \n",
+ "json_backup = json_output.copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "a308436b-d478-48e9-b476-a8c50878f912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total no of samples : 180\n",
+ "Total no of samples without images: 161\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " image | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 170 | \n",
+ " એક અવરોધક માટે વર્ણ-સંકેત નીચે આપેલ છે : | \n",
+ " [4.7 kΩ, 5%, 470 Ω, 5%, 470 kΩ, 5%, 47 kΩ, 10%] | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 171 | \n",
+ " r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... | \n",
+ " [10.0 g, 20.0 g, 2.5 g, 5.0 g] | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 172 | \n",
+ " એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... | \n",
+ " [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 173 | \n",
+ " અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... | \n",
+ " [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 174 | \n",
+ " એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... | \n",
+ " [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original_question_num question \\\n",
+ "0 170 એક અવરોધક માટે વર્ણ-સંકેત નીચે આપેલ છે : \n",
+ "1 171 r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... \n",
+ "2 172 એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... \n",
+ "3 173 અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... \n",
+ "4 174 એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... \n",
+ "\n",
+ " options image \n",
+ "0 [4.7 kΩ, 5%, 470 Ω, 5%, 470 kΩ, 5%, 47 kΩ, 10%] True \n",
+ "1 [10.0 g, 20.0 g, 2.5 g, 5.0 g] False \n",
+ "2 [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] False \n",
+ "3 [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... False \n",
+ "4 [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] False "
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output = []\n",
+ "for key in json_output.keys():\n",
+ " output += json_output[key]\n",
+ "\n",
+ "\n",
+ "df = pd.DataFrame(output)\n",
+ "df.columns = ['original_question_num','question','options','image']\n",
+ "\n",
+ "print(f\"Total no of samples : {len(df)}\")\n",
+ "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea",
+ "metadata": {},
+ "source": [
+ "#### Removing samples with images/tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "2c118ef0-1eda-453a-ab8a-2db286928528",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df[df.image == False]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5066c956-a46b-44a7-973f-ba3589a5a028",
+ "metadata": {},
+ "source": [
+ "#### Attaching Answer Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(answer_file) as f:\n",
+ " answer_key = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n",
+ "\n",
+ "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32a86338-be74-4a89-bb70-2821910f43ee",
+ "metadata": {},
+ "source": [
+ "#### Assigning categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "category_map = {'biology': 'जीवविज्ञान' ,'chemistry':'रसायन विज्ञान','physics':'भौतिक विज्ञान'}\n",
+ "\n",
+ "question_category_map = {}\n",
+ "for i in range(1,46):\n",
+ " question_category_map[i] = 'physics'\n",
+ "\n",
+ "for i in range(46,91):\n",
+ " question_category_map[i] = 'chemistry'\n",
+ "\n",
+ "for i in range(91,181):\n",
+ " question_category_map[i] = 'biology'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "bda39801-e316-48ee-9c68-c7178935cd59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n",
+ "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd7aaff4-1f19-47fd-a52b-105870444262",
+ "metadata": {},
+ "source": [
+ "#### Assigning other metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['language'] = lang_code\n",
+ "df['country'] = country\n",
+ "df['file_name'] = file_name\n",
+ "df['source'] = source\n",
+ "df['license'] = license\n",
+ "df['level'] = level\n",
+ "\n",
+ "\n",
+ "df_ = df.copy()\n",
+ "df.drop('image', axis = 1, inplace = True)\n",
+ "\n",
+ "cols = ['language',\n",
+ " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n",
+ " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n",
+ "\n",
+ "df = df[cols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " language | \n",
+ " country | \n",
+ " file_name | \n",
+ " source | \n",
+ " license | \n",
+ " level | \n",
+ " category_en | \n",
+ " category_original_lang | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " answer | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " gu | \n",
+ " India | \n",
+ " Paper_20201106083723.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवविज्ञान | \n",
+ " 171 | \n",
+ " r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... | \n",
+ " [10.0 g, 20.0 g, 2.5 g, 5.0 g] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " gu | \n",
+ " India | \n",
+ " Paper_20201106083723.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवविज्ञान | \n",
+ " 172 | \n",
+ " એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... | \n",
+ " [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " gu | \n",
+ " India | \n",
+ " Paper_20201106083723.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवविज्ञान | \n",
+ " 173 | \n",
+ " અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... | \n",
+ " [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " gu | \n",
+ " India | \n",
+ " Paper_20201106083723.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवविज्ञान | \n",
+ " 174 | \n",
+ " એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... | \n",
+ " [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " gu | \n",
+ " India | \n",
+ " Paper_20201106083723.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवविज्ञान | \n",
+ " 175 | \n",
+ " 20 cm² ક્ષેત્રફળ ધરાવતી એક અખરાવતિત સપાટી પર 2... | \n",
+ " [24×10³ J, 48×10³ J, 10×10³ J, 12×10³ J] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " language country file_name \\\n",
+ "1 gu India Paper_20201106083723.pdf \n",
+ "2 gu India Paper_20201106083723.pdf \n",
+ "3 gu India Paper_20201106083723.pdf \n",
+ "4 gu India Paper_20201106083723.pdf \n",
+ "5 gu India Paper_20201106083723.pdf \n",
+ "\n",
+ " source license level \\\n",
+ "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "5 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "\n",
+ " category_en category_original_lang original_question_num \\\n",
+ "1 biology जीवविज्ञान 171 \n",
+ "2 biology जीवविज्ञान 172 \n",
+ "3 biology जीवविज्ञान 173 \n",
+ "4 biology जीवविज्ञान 174 \n",
+ "5 biology जीवविज्ञान 175 \n",
+ "\n",
+ " question \\\n",
+ "1 r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... \n",
+ "2 એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... \n",
+ "3 અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... \n",
+ "4 એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... \n",
+ "5 20 cm² ક્ષેત્રફળ ધરાવતી એક અખરાવતિત સપાટી પર 2... \n",
+ "\n",
+ " options answer \n",
+ "1 [10.0 g, 20.0 g, 2.5 g, 5.0 g] 1 \n",
+ "2 [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] 4 \n",
+ "3 [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... 2 \n",
+ "4 [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] 4 \n",
+ "5 [24×10³ J, 48×10³ J, 10×10³ J, 12×10³ J] 1 "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afed88ff-1bb6-4157-934e-e72a40538a1a",
+ "metadata": {},
+ "source": [
+ "#### Save processed file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_json(output_file_name+'.json', orient='records')\n",
+ "df_.to_csv(output_file_name+'.tsv',sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "161"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb b/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb
new file mode 100644
index 0000000..0b2075b
--- /dev/null
+++ b/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb
@@ -0,0 +1,1046 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/Users/roshansk/Documents/GitHub/Global_Exams'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import base64\n",
+ "from openai import OpenAI\n",
+ "from anthropic import Anthropic\n",
+ "from pdf2image import convert_from_path\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "320b7be4-6a54-4169-8666-0a2c56359109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def encode_image(image_path):\n",
+ " \"\"\"\n",
+ " Function to encode the image\n",
+ " \"\"\"\n",
+ " with open(image_path, \"rb\") as image_file:\n",
+ " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+ "\n",
+ "\n",
+ "def get_message_list(base64_string, prompt):\n",
+ "\n",
+ " message_list = [\n",
+ " {\n",
+ " \"role\": 'user',\n",
+ " \"content\": [\n",
+ " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n",
+ " {\"type\": \"text\", \"text\": prompt}\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ " return message_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from anthropic import Anthropic\n",
+ "\n",
+ "client = Anthropic()\n",
+ "\n",
+ "# MODEL_NAME = \"claude-3-opus-20240229\"\n",
+ "\n",
+ "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Marathi_G3.pdf'\n",
+ "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G3_Answer_Key.json'\n",
+ "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Marathi_G3'\n",
+ "\n",
+ "\n",
+ "lang_code = 'mr'\n",
+ "country = 'India'\n",
+ "file_name = 'Paper_20201106084438.pdf'\n",
+ "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106084438.pdf'\n",
+ "license = 'open'\n",
+ "level = 'University'\n",
+ "\n",
+ "output_file_name = 'NEET_2020_Marathi_processed'\n",
+ "\n",
+ "pages_to_include=[2,21]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18",
+ "metadata": {},
+ "source": [
+ "## Prompt (user input required: edit the prompt so it names the language and column layout of the exam paper)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "# \"\"\"\n",
+ "\n",
+ "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Marathi.\n",
+ "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "The json value should have the following keys : \n",
+ "- number : The number of the question\n",
+ "- question : The actual text of the question\n",
+ "- options : A list containing all 4 options for the question\n",
+ "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7",
+ "metadata": {},
+ "source": [
+ "## Creating Image Files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(image_folder):\n",
+ " os.makedirs(image_folder)\n",
+ "\n",
+ "pages = convert_from_path(pdf_file)\n",
+ "\n",
+ "start_page, end_page = pages_to_include \n",
+ "count = 0\n",
+ "for i in range(start_page-1, end_page):\n",
+ " filename = \"page_\"+str(i)+'.jpg'\n",
+ " pages[i].save(Path(image_folder) / filename)\n",
+ " count += 1\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20 images created\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"{count} images created\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcbe3632-fee8-473c-a206-c417f638484b",
+ "metadata": {},
+ "source": [
+ "## Extracting questions from the page images with Claude"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "1b4d55cc-c997-4614-8405-9b48154f8c80",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 5%|██▏ | 1/20 [00:31<09:49, 31.00s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_2.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 10%|████▍ | 2/20 [00:57<08:35, 28.63s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_3.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 15%|██████▌ | 3/20 [01:24<07:50, 27.68s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_1.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 20%|████████▊ | 4/20 [01:55<07:41, 28.83s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_4.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 25%|███████████ | 5/20 [02:43<08:56, 35.74s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_5.jpg added to text\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 30%|█████████████▏ | 6/20 [03:04<07:11, 30.80s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_7.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 35%|███████████████▍ | 7/20 [03:30<06:21, 29.33s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_6.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 40%|█████████████████▌ | 8/20 [03:56<05:38, 28.18s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_19.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 45%|███████████████████▊ | 9/20 [04:21<04:59, 27.25s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_18.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 50%|█████████████████████▌ | 10/20 [04:29<03:32, 21.27s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_20.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 55%|███████████████████████▋ | 11/20 [04:50<03:11, 21.24s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_10.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 60%|█████████████████████████▊ | 12/20 [09:07<12:23, 92.89s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_11.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 65%|███████████████████████████▉ | 13/20 [09:37<08:37, 73.95s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_13.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 70%|██████████████████████████████ | 14/20 [09:59<05:48, 58.14s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_12.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 75%|████████████████████████████████▎ | 15/20 [10:30<04:09, 49.99s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_16.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 80%|██████████████████████████████████▍ | 16/20 [10:50<02:43, 40.86s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_17.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 85%|████████████████████████████████████▌ | 17/20 [11:11<01:44, 34.98s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_15.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 90%|██████████████████████████████████████▋ | 18/20 [11:36<01:03, 31.86s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_14.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 95%|████████████████████████████████████████▊ | 19/20 [11:53<00:27, 27.59s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_8.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████| 20/20 [12:12<00:00, 36.60s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_9.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "imageList = os.listdir(image_folder)\n",
+ "\n",
+ "json_output = {}\n",
+ "text_output = {}\n",
+ "\n",
+ "for i in tqdm(range(len(imageList))):\n",
+ " image_path = Path(image_folder) / imageList[i]\n",
+ " base64_string = encode_image(image_path)\n",
+ "\n",
+ " message_list = get_message_list(base64_string, prompt) \n",
+ "\n",
+ " response = client.messages.create(\n",
+ " model=MODEL_NAME,\n",
+ " max_tokens=4096,\n",
+ " messages=message_list,\n",
+ " temperature = 0.0,\n",
+ " top_p = 1\n",
+ " \n",
+ " )\n",
+ "\n",
+ " try:\n",
+ " out = eval(response.content[0].text)\n",
+ "\n",
+ " json_output[i] = out \n",
+ " print(f\"{imageList[i]} added to json\")\n",
+ " except:\n",
+ " text_output[i] = response.content[0].text\n",
+ " print(f\"{imageList[i]} added to text\")\n",
+ " \n",
+ "json_backup = json_output.copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "a308436b-d478-48e9-b476-a8c50878f912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total no of samples : 171\n",
+ "Total no of samples without images: 149\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " image | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... | \n",
+ " [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11 | \n",
+ " प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... | \n",
+ " [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 12 | \n",
+ " अयोग्य विधान ओळखा : | \n",
+ " [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13 | \n",
+ " पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... | \n",
+ " [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 14 | \n",
+ " जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... | \n",
+ " [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original_question_num question \\\n",
+ "0 10 वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... \n",
+ "1 11 प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... \n",
+ "2 12 अयोग्य विधान ओळखा : \n",
+ "3 13 पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... \n",
+ "4 14 जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... \n",
+ "\n",
+ " options image \n",
+ "0 [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... False \n",
+ "1 [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... False \n",
+ "2 [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... False \n",
+ "3 [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... False \n",
+ "4 [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... False "
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output = []\n",
+ "for key in json_output.keys():\n",
+ " output += json_output[key]\n",
+ "\n",
+ "\n",
+ "df = pd.DataFrame(output)\n",
+ "df.columns = ['original_question_num','question','options','image']\n",
+ "\n",
+ "print(f\"Total no of samples : {len(df)}\")\n",
+ "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea",
+ "metadata": {},
+ "source": [
+ "#### Removing samples with images/tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "2c118ef0-1eda-453a-ab8a-2db286928528",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df[df.image == False]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5066c956-a46b-44a7-973f-ba3589a5a028",
+ "metadata": {},
+ "source": [
+ "#### Attaching Answer Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(answer_file) as f:\n",
+ " answer_key = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n",
+ "\n",
+ "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32a86338-be74-4a89-bb70-2821910f43ee",
+ "metadata": {},
+ "source": [
+ "#### Assigning categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "category_map = {'biology': 'जीवशास्त्र' ,'chemistry':'रसायनशास्त्र','physics':'भौतिकशास्त्र'}\n",
+ "\n",
+ "question_category_map = {}\n",
+ "for i in range(1,91):\n",
+ " question_category_map[i] = 'biology'\n",
+ "\n",
+ "for i in range(91,136):\n",
+ " question_category_map[i] = 'physics'\n",
+ "\n",
+ "for i in range(136,181):\n",
+ " question_category_map[i] = 'chemistry'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "bda39801-e316-48ee-9c68-c7178935cd59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n",
+ "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd7aaff4-1f19-47fd-a52b-105870444262",
+ "metadata": {},
+ "source": [
+ "#### Assigning other metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['language'] = lang_code\n",
+ "df['country'] = country\n",
+ "df['file_name'] = file_name\n",
+ "df['source'] = source\n",
+ "df['license'] = license\n",
+ "df['level'] = level\n",
+ "\n",
+ "\n",
+ "df_ = df.copy()\n",
+ "df.drop('image', axis = 1, inplace = True)\n",
+ "\n",
+ "cols = ['language',\n",
+ " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n",
+ " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n",
+ "\n",
+ "df = df[cols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " language | \n",
+ " country | \n",
+ " file_name | \n",
+ " source | \n",
+ " license | \n",
+ " level | \n",
+ " category_en | \n",
+ " category_original_lang | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " answer | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " mr | \n",
+ " India | \n",
+ " Paper_20201106084438.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवशास्त्र | \n",
+ " 10 | \n",
+ " वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... | \n",
+ " [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " mr | \n",
+ " India | \n",
+ " Paper_20201106084438.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवशास्त्र | \n",
+ " 11 | \n",
+ " प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... | \n",
+ " [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " mr | \n",
+ " India | \n",
+ " Paper_20201106084438.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवशास्त्र | \n",
+ " 12 | \n",
+ " अयोग्य विधान ओळखा : | \n",
+ " [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " mr | \n",
+ " India | \n",
+ " Paper_20201106084438.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवशास्त्र | \n",
+ " 13 | \n",
+ " पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... | \n",
+ " [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " mr | \n",
+ " India | \n",
+ " Paper_20201106084438.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " जीवशास्त्र | \n",
+ " 14 | \n",
+ " जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... | \n",
+ " [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " language country file_name \\\n",
+ "0 mr India Paper_20201106084438.pdf \n",
+ "1 mr India Paper_20201106084438.pdf \n",
+ "2 mr India Paper_20201106084438.pdf \n",
+ "3 mr India Paper_20201106084438.pdf \n",
+ "4 mr India Paper_20201106084438.pdf \n",
+ "\n",
+ " source license level \\\n",
+ "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "\n",
+ " category_en category_original_lang original_question_num \\\n",
+ "0 biology जीवशास्त्र 10 \n",
+ "1 biology जीवशास्त्र 11 \n",
+ "2 biology जीवशास्त्र 12 \n",
+ "3 biology जीवशास्त्र 13 \n",
+ "4 biology जीवशास्त्र 14 \n",
+ "\n",
+ " question \\\n",
+ "0 वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... \n",
+ "1 प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... \n",
+ "2 अयोग्य विधान ओळखा : \n",
+ "3 पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... \n",
+ "4 जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... \n",
+ "\n",
+ " options answer \n",
+ "0 [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... 3 \n",
+ "1 [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... 1 \n",
+ "2 [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... 2 \n",
+ "3 [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... 1 \n",
+ "4 [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... 2 "
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afed88ff-1bb6-4157-934e-e72a40538a1a",
+ "metadata": {},
+ "source": [
+ "#### Save processed file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_json(output_file_name+'.json', orient='records')\n",
+ "df_.to_csv(output_file_name+'.tsv',sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "161"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb b/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb
new file mode 100644
index 0000000..5226b40
--- /dev/null
+++ b/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb
@@ -0,0 +1,1074 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/Users/roshansk/Documents/GitHub/Global_Exams'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import base64\n",
+ "from openai import OpenAI\n",
+ "from anthropic import Anthropic\n",
+ "from pdf2image import convert_from_path\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "import os\n",
+ "os.getcwd()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "320b7be4-6a54-4169-8666-0a2c56359109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def encode_image(image_path):\n",
+ " \"\"\"\n",
+ " Function to encode the image\n",
+ " \"\"\"\n",
+ " with open(image_path, \"rb\") as image_file:\n",
+ " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+ "\n",
+ "\n",
+ "def get_message_list(base64_string, prompt):\n",
+ "\n",
+ " message_list = [\n",
+ " {\n",
+ " \"role\": 'user',\n",
+ " \"content\": [\n",
+ " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n",
+ " {\"type\": \"text\", \"text\": prompt}\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ " return message_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from anthropic import Anthropic\n",
+ "\n",
+ "client = Anthropic()\n",
+ "\n",
+ "# MODEL_NAME = \"claude-3-opus-20240229\"\n",
+ "\n",
+ "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Tamil_G4.pdf'\n",
+ "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G4_Answer_Key.json'\n",
+ "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Tamil_G4'\n",
+ "\n",
+ "\n",
+ "lang_code = 'ta'\n",
+ "country = 'India'\n",
+ "file_name = 'Paper_20201106090359.pdf'\n",
+ "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106090359.pdf'\n",
+ "license = 'open'\n",
+ "level = 'University'\n",
+ "\n",
+ "output_file_name = 'NEET_2020_Tamil_processed'\n",
+ "\n",
+ "pages_to_include=[2,23]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18",
+ "metadata": {},
+ "source": [
+ "## Prompt (User input required. Change the prompt value to account for language)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "# \"\"\"\n",
+ "\n",
+ "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n",
+ "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n",
+ "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "# The json value should have the following keys : \n",
+ "# - number : The number of the question\n",
+ "# - question : The actual text of the question\n",
+ "# - options : A list containing all 4 options for the question\n",
+ "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n",
+ "# \"\"\"\n",
+ "\n",
+ "\n",
+ "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Tamil.\n",
+ "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n",
+ "The json value should have the following keys : \n",
+ "- number : The number of the question\n",
+ "- question : The actual text of the question\n",
+ "- options : A list containing all 4 options for the question\n",
+ "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n",
+ "\n",
+ "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n",
+ "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n",
+ "\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7",
+ "metadata": {},
+ "source": [
+ "## Creating Image Files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Create the output folder; exist_ok avoids the separate existence check.\n",
+    "os.makedirs(image_folder, exist_ok=True)\n",
+    "\n",
+    "# Render only the requested page range (1-based, inclusive) instead of the whole PDF.\n",
+    "start_page, end_page = pages_to_include\n",
+    "pages = convert_from_path(pdf_file, first_page=start_page, last_page=end_page)\n",
+    "\n",
+    "# File names keep the original 0-based numbering (PDF page 2 -> page_1.jpg).\n",
+    "count = 0\n",
+    "for i, page in enumerate(pages, start=start_page - 1):\n",
+    "    page.save(Path(image_folder) / f\"page_{i}.jpg\")\n",
+    "    count += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "22 images created\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"{count} images created\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcbe3632-fee8-473c-a206-c417f638484b",
+ "metadata": {},
+ "source": [
+    "## Extracting questions with Claude"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "1b4d55cc-c997-4614-8405-9b48154f8c80",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 5%|██ | 1/22 [00:31<11:00, 31.45s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_2.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 9%|████ | 2/22 [00:59<09:46, 29.33s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_3.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 14%|██████ | 3/22 [01:21<08:16, 26.14s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_1.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 18%|████████ | 4/22 [01:48<07:55, 26.43s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_4.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 23%|██████████ | 5/22 [02:14<07:25, 26.22s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_5.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 27%|███████████▋ | 6/22 [06:41<28:49, 108.09s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_7.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 32%|██████████████ | 7/22 [06:59<19:40, 78.68s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_6.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 36%|████████████████ | 8/22 [07:26<14:30, 62.17s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_19.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 41%|██████████████████ | 9/22 [07:53<11:06, 51.27s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_18.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 45%|███████████████████▌ | 10/22 [08:04<07:44, 38.67s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_22.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 50%|█████████████████████▌ | 11/22 [08:29<06:19, 34.51s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_20.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 55%|██████████████████████▉ | 12/22 [14:32<22:25, 134.51s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_21.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 59%|████████████████████████▊ | 13/22 [15:04<15:30, 103.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_10.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 64%|███████████████████████████▎ | 14/22 [15:27<10:33, 79.20s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_11.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 68%|█████████████████████████████▎ | 15/22 [15:52<07:20, 62.90s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_13.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 73%|███████████████████████████████▎ | 16/22 [16:18<05:11, 51.86s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_12.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 77%|█████████████████████████████████▏ | 17/22 [16:44<03:39, 43.84s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_16.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 82%|██████████████████████████████████▎ | 18/22 [23:51<10:36, 159.11s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_17.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 86%|████████████████████████████████████▎ | 19/22 [24:18<05:58, 119.41s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_15.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 91%|███████████████████████████████████████ | 20/22 [24:46<03:03, 91.92s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_14.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 95%|█████████████████████████████████████████ | 21/22 [25:08<01:10, 70.92s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_8.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████| 22/22 [25:31<00:00, 69.61s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "page_9.jpg added to json\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Keep only the rendered page images (os.listdir also returns stray files such as .DS_Store)\n",
+    "# and visit them in numeric page order; os.listdir order is arbitrary.\n",
+    "page_files = [f for f in os.listdir(image_folder) if f.startswith('page_') and f.endswith('.jpg')]\n",
+    "imageList = sorted(page_files, key=lambda f: int(Path(f).stem.split('_')[-1]))\n",
+    "\n",
+    "json_output = {}  # index in imageList -> parsed list of question dicts\n",
+    "text_output = {}  # index in imageList -> raw response text that failed to parse\n",
+    "\n",
+    "for i, image_name in enumerate(tqdm(imageList)):\n",
+    "    image_path = Path(image_folder) / image_name\n",
+    "    base64_string = encode_image(image_path)\n",
+    "    message_list = get_message_list(base64_string, prompt)\n",
+    "\n",
+    "    response = client.messages.create(\n",
+    "        model=MODEL_NAME,\n",
+    "        max_tokens=4096,\n",
+    "        messages=message_list,\n",
+    "        temperature=0.0,\n",
+    "        top_p=1,\n",
+    "    )\n",
+    "    response_text = response.content[0].text\n",
+    "    # NOTE(review): eval() executes whatever the model returned; ast.literal_eval would be the safer parser.\n",
+    "    try:\n",
+    "        json_output[i] = eval(response_text)\n",
+    "        print(f\"{image_name} added to json\")\n",
+    "    except Exception:  # unparseable output is kept for manual inspection, not dropped\n",
+    "        text_output[i] = response_text\n",
+    "        print(f\"{image_name} added to text\")\n",
+    "\n",
+    "json_backup = json_output.copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "a308436b-d478-48e9-b476-a8c50878f912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total no of samples : 180\n",
+ "Total no of samples without images: 160\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " image | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " சரியற்ற கூற்றை கண்டறிக. | \n",
+ " [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11 | \n",
+ " உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... | \n",
+ " [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 12 | \n",
+ " அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... | \n",
+ " [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13 | \n",
+ " வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? | \n",
+ " [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 14 | \n",
+ " S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... | \n",
+ " [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original_question_num question \\\n",
+ "0 10 சரியற்ற கூற்றை கண்டறிக. \n",
+ "1 11 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n",
+ "2 12 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n",
+ "3 13 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n",
+ "4 14 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n",
+ "\n",
+ " options image \n",
+ "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... False \n",
+ "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... False \n",
+ "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... False \n",
+ "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... False \n",
+ "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... False "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Flatten the per-page question lists into a single list of question dicts.\n",
+    "output = [question for page in json_output.values() for question in page]\n",
+    "\n",
+    "# Rename and select by key: assigning df.columns positionally mislabels the\n",
+    "# columns whenever the model emits the keys in a different order.\n",
+    "df = pd.DataFrame(output).rename(columns={'number': 'original_question_num'})\n",
+    "df = df[['original_question_num', 'question', 'options', 'image']]\n",
+    "\n",
+    "print(f\"Total no of samples : {len(df)}\")\n",
+    "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n",
+    "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea",
+ "metadata": {},
+ "source": [
+ "#### Removing samples with images/tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "2c118ef0-1eda-453a-ab8a-2db286928528",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "df = df[df.image == False].copy()  # independent copy so later column assignments do not trigger SettingWithCopyWarning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5066c956-a46b-44a7-973f-ba3589a5a028",
+ "metadata": {},
+ "source": [
+ "#### Attaching Answer Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(answer_file) as f:\n",
+ " answer_key = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+    "# Normalise question numbers to bare digit strings ('12.' or 12 -> '12'); answer_key comes from json.load, so its keys are strings.\n",
+    "df['original_question_num'] = df.original_question_num.apply(lambda x: str(x).replace(\".\", \"\").strip())\n",
+    "df['answer'] = df.original_question_num.apply(lambda x: str(answer_key[x]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32a86338-be74-4a89-bb70-2821910f43ee",
+ "metadata": {},
+ "source": [
+ "#### Assigning categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "category_map = {'biology': 'உயிரியல்' ,'chemistry':'வேதியியல்','physics':'இயற்பியல்'}\n",
+ "\n",
+ "question_category_map = {}\n",
+ "for i in range(1,91):\n",
+ " question_category_map[i] = 'biology'\n",
+ "\n",
+ "for i in range(91,136):\n",
+ " question_category_map[i] = 'physics'\n",
+ "\n",
+ "for i in range(136,181):\n",
+ " question_category_map[i] = 'chemistry'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "bda39801-e316-48ee-9c68-c7178935cd59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n",
+ "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd7aaff4-1f19-47fd-a52b-105870444262",
+ "metadata": {},
+ "source": [
+ "#### Assigning other metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['language'] = lang_code\n",
+ "df['country'] = country\n",
+ "df['file_name'] = file_name\n",
+ "df['source'] = source\n",
+ "df['license'] = license\n",
+ "df['level'] = level\n",
+ "\n",
+ "\n",
+ "df_ = df.copy()\n",
+ "df.drop('image', axis = 1, inplace = True)\n",
+ "\n",
+ "cols = ['language',\n",
+ " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n",
+ " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n",
+ "\n",
+ "df = df[cols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " language | \n",
+ " country | \n",
+ " file_name | \n",
+ " source | \n",
+ " license | \n",
+ " level | \n",
+ " category_en | \n",
+ " category_original_lang | \n",
+ " original_question_num | \n",
+ " question | \n",
+ " options | \n",
+ " answer | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 10 | \n",
+ " சரியற்ற கூற்றை கண்டறிக. | \n",
+ " [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 11 | \n",
+ " உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... | \n",
+ " [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 12 | \n",
+ " அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... | \n",
+ " [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 13 | \n",
+ " வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? | \n",
+ " [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ta | \n",
+ " India | \n",
+ " Paper_20201106090359.pdf | \n",
+ " https://www.nta.ac.in/Download/ExamPaper/Paper... | \n",
+ " open | \n",
+ " University | \n",
+ " biology | \n",
+ " உயிரியல் | \n",
+ " 14 | \n",
+ " S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... | \n",
+ " [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " language country file_name \\\n",
+ "0 ta India Paper_20201106090359.pdf \n",
+ "1 ta India Paper_20201106090359.pdf \n",
+ "2 ta India Paper_20201106090359.pdf \n",
+ "3 ta India Paper_20201106090359.pdf \n",
+ "4 ta India Paper_20201106090359.pdf \n",
+ "\n",
+ " source license level \\\n",
+ "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n",
+ "\n",
+ " category_en category_original_lang original_question_num \\\n",
+ "0 biology உயிரியல் 10 \n",
+ "1 biology உயிரியல் 11 \n",
+ "2 biology உயிரியல் 12 \n",
+ "3 biology உயிரியல் 13 \n",
+ "4 biology உயிரியல் 14 \n",
+ "\n",
+ " question \\\n",
+ "0 சரியற்ற கூற்றை கண்டறிக. \n",
+ "1 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n",
+ "2 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n",
+ "3 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n",
+ "4 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n",
+ "\n",
+ " options answer \n",
+ "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... 2 \n",
+ "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... 1 \n",
+ "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... 1 \n",
+ "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... 4 \n",
+ "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... 4 "
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afed88ff-1bb6-4157-934e-e72a40538a1a",
+ "metadata": {},
+ "source": [
+ "#### Save processed file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "df.to_json(output_file_name+'.json', orient='records', force_ascii=False)  # write non-ASCII text as-is rather than as ASCII escape sequences\n",
+    "df_.to_csv(output_file_name+'.tsv',sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "160"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}