diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..a32742a Binary files /dev/null and b/.DS_Store differ diff --git a/misc/.DS_Store b/misc/.DS_Store new file mode 100644 index 0000000..a7f5ee5 Binary files /dev/null and b/misc/.DS_Store differ diff --git a/misc/NEET/.DS_Store b/misc/NEET/.DS_Store new file mode 100644 index 0000000..f894b1c Binary files /dev/null and b/misc/NEET/.DS_Store differ diff --git a/misc/NEET/Generator_Template.ipynb b/misc/NEET/Generator_Template.ipynb new file mode 100644 index 0000000..f0182c7 --- /dev/null +++ b/misc/NEET/Generator_Template.ipynb @@ -0,0 +1,1097 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/roshansk/Documents/GitHub/Global_Exams'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import base64\n", + "from openai import OpenAI\n", + "from anthropic import Anthropic\n", + "from pdf2image import convert_from_path\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import json\n", + "\n", + "import os\n", + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320b7be4-6a54-4169-8666-0a2c56359109", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " \"\"\"\n", + " Function to encode the image\n", + " \"\"\"\n", + " with open(image_path, \"rb\") as image_file:\n", + " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + "\n", + "\n", + "def get_message_list(base64_string, prompt):\n", + "\n", + " message_list = [\n", + " {\n", + " \"role\": 'user',\n", + " \"content\": [\n", + " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n", + " {\"type\": \"text\", \"text\": prompt}\n", + " ]\n", + " }\n", + " ]\n", + "\n", + " return message_list" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae", + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "\n", + "client = Anthropic()\n", + "\n", + "# MODEL_NAME = \"claude-3-opus-20240229\"\n", + "\n", + "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Tamil_G4.pdf'\n", + "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G4_Answer_Key.json'\n", + "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Tamil_G4'\n", + "\n", + "\n", + "lang_code = 'ta'\n", + "country = 'India'\n", + "file_name = 'Paper_20201106090359.pdf'\n", + "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106090359.pdf'\n", + "license = 'open'\n", + "level = 'University'\n", + "\n", + "output_file_name = 'NEET_2020_Tamil_processed'\n", + "\n", + "pages_to_include=[2,23]" + ] + }, + { + "cell_type": "markdown", + "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18", + "metadata": {}, + "source": [ + "## Prompt (User input required. Change the prompt value to account for language)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981", + "metadata": {}, + "outputs": [], + "source": [ + "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "# \"\"\"\n", + "\n", + "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n", + "# \"\"\"\n", + "\n", + "\n", + "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n", + "# \"\"\"\n", + "\n", + "\n", + "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Tamil.\n", + "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "The json value should have the following keys : \n", + "- number : The number of the question\n", + "- question : The actual text of the question\n", + "- options : A list containing all 4 options for the question\n", + "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7", + "metadata": {}, + "source": [ + "## Creating Image Files" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(image_folder):\n", + " os.makedirs(image_folder)\n", + "\n", + "pages = convert_from_path(pdf_file)\n", + "\n", + "start_page, end_page = pages_to_include \n", + "count = 0\n", + "for i in range(start_page-1, end_page):\n", + " filename = \"page_\"+str(i)+'.jpg'\n", + " pages[i].save(Path(image_folder) / filename)\n", + " count += 1\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 images created\n" + ] + } + ], + "source": [ + "print(f\"{count} images created\")" + ] + }, + { + "cell_type": "markdown", + "id": "dcbe3632-fee8-473c-a206-c417f638484b", + "metadata": {}, + "source": [ + "## Extracting questions from Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "1b4d55cc-c997-4614-8405-9b48154f8c80", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|██ | 1/22 [00:31<11:00, 31.45s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_2.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 9%|████ | 2/22 [00:59<09:46, 29.33s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_3.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 14%|██████ | 3/22 [01:21<08:16, 26.14s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_1.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 18%|████████ | 4/22 [01:48<07:55, 26.43s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_4.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 23%|██████████ | 5/22 [02:14<07:25, 26.22s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_5.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 27%|███████████▋ | 6/22 [06:41<28:49, 108.09s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_7.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 32%|██████████████ | 7/22 [06:59<19:40, 78.68s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_6.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 36%|████████████████ | 8/22 [07:26<14:30, 62.17s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_19.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 41%|██████████████████ | 9/22 [07:53<11:06, 51.27s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_18.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 45%|███████████████████▌ | 10/22 [08:04<07:44, 38.67s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_22.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████████████████████▌ | 11/22 [08:29<06:19, 34.51s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_20.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 55%|██████████████████████▉ | 12/22 [14:32<22:25, 134.51s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_21.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 59%|████████████████████████▊ | 13/22 [15:04<15:30, 103.41s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_10.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 64%|███████████████████████████▎ | 14/22 [15:27<10:33, 79.20s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_11.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 68%|█████████████████████████████▎ | 15/22 [15:52<07:20, 62.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_13.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████████████████████████████▎ | 16/22 [16:18<05:11, 51.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_12.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 77%|█████████████████████████████████▏ | 17/22 [16:44<03:39, 43.84s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_16.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 82%|██████████████████████████████████▎ | 18/22 [23:51<10:36, 159.11s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_17.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 86%|████████████████████████████████████▎ | 19/22 [24:18<05:58, 119.41s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_15.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 91%|███████████████████████████████████████ | 20/22 [24:46<03:03, 91.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_14.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|█████████████████████████████████████████ | 21/22 [25:08<01:10, 70.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_8.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 22/22 [25:31<00:00, 69.61s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_9.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "imageList = os.listdir(image_folder)\n", + "\n", + "json_output = {}\n", + "text_output = {}\n", + "\n", + "for i in tqdm(range(len(imageList))):\n", + " image_path = Path(image_folder) / imageList[i]\n", + " base64_string = encode_image(image_path)\n", + "\n", + " message_list = get_message_list(base64_string, prompt) \n", + "\n", + " response = client.messages.create(\n", + " model=MODEL_NAME,\n", + " max_tokens=4096,\n", + " messages=message_list,\n", + " temperature = 0.0,\n", + " top_p = 1\n", + " \n", + " )\n", + "\n", + " try:\n", + " out = eval(response.content[0].text)\n", + "\n", + " json_output[i] = out \n", + " print(f\"{imageList[i]} added to json\")\n", + " except:\n", + " text_output[i] = response.content[0].text\n", + " print(f\"{imageList[i]} added to text\")\n", + " \n", + "json_backup = json_output.copy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "a308436b-d478-48e9-b476-a8c50878f912", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total no of samples : 180\n", + "Total no of samples without images: 160\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_question_numquestionoptionsimage
010சரியற்ற கூற்றை கண்டறிக.[சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந...False
111உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர...[துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க...False
212அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா...[அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி...False
313வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ?[ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந...False
414S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ...[800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ...False
\n", + "
" + ], + "text/plain": [ + " original_question_num question \\\n", + "0 10 சரியற்ற கூற்றை கண்டறிக. \n", + "1 11 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n", + "2 12 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n", + "3 13 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n", + "4 14 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n", + "\n", + " options image \n", + "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... False \n", + "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... False \n", + "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... False \n", + "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... False \n", + "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... False " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = []\n", + "for key in json_output.keys():\n", + " output += json_output[key]\n", + "\n", + "\n", + "df = pd.DataFrame(output)\n", + "df.columns = ['original_question_num','question','options','image']\n", + "\n", + "print(f\"Total no of samples : {len(df)}\")\n", + "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea", + "metadata": {}, + "source": [ + "#### Removing samples with images/tables" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "2c118ef0-1eda-453a-ab8a-2db286928528", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.image == False]" + ] + }, + { + "cell_type": "markdown", + "id": "5066c956-a46b-44a7-973f-ba3589a5a028", + "metadata": {}, + "source": [ + "#### Attaching Answer Key" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9", + "metadata": {}, + "outputs": [], + "source": [ + "with open(answer_file) as f:\n", + " answer_key = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n", + "\n", + "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))" + ] + }, + { + "cell_type": "markdown", + "id": "32a86338-be74-4a89-bb70-2821910f43ee", + "metadata": {}, + "source": [ + "#### Assigning categories" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19", + "metadata": {}, + "outputs": [], + "source": [ + "category_map = {'biology': 'உயிரியல்' ,'chemistry':'வேதியியல்','physics':'இயற்பியல்'}\n", + "\n", + "question_category_map = {}\n", + "for i in range(1,91):\n", + " question_category_map[i] = 'biology'\n", + "\n", + "for i in range(91,136):\n", + " question_category_map[i] = 'physics'\n", + "\n", + "for i in range(136,181):\n", + " question_category_map[i] = 'chemistry'" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bda39801-e316-48ee-9c68-c7178935cd59", + "metadata": {}, + "outputs": [], + "source": [ + "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n", + "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])" + ] + }, + { + "cell_type": "markdown", + "id": "fd7aaff4-1f19-47fd-a52b-105870444262", + "metadata": {}, + "source": [ + "#### Assigning other metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891", + "metadata": {}, + "outputs": [], + "source": [ + "df['language'] = lang_code\n", + "df['country'] = country\n", + "df['file_name'] = file_name\n", + "df['source'] = source\n", + "df['license'] = license\n", + "df['level'] = level\n", + "\n", + "\n", + "df_ = df.copy()\n", + "df.drop('image', axis = 1, inplace = True)\n", + "\n", + "cols = ['language',\n", + " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n", + " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n", + "\n", + "df = df[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
languagecountryfile_namesourcelicenselevelcategory_encategory_original_langoriginal_question_numquestionoptionsanswer
0taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்10சரியற்ற கூற்றை கண்டறிக.[சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந...2
1taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்11உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர...[துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க...1
2taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்12அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா...[அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி...1
3taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்13வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ?[ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந...4
4taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்14S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ...[800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ...4
\n", + "
" + ], + "text/plain": [ + " language country file_name \\\n", + "0 ta India Paper_20201106090359.pdf \n", + "1 ta India Paper_20201106090359.pdf \n", + "2 ta India Paper_20201106090359.pdf \n", + "3 ta India Paper_20201106090359.pdf \n", + "4 ta India Paper_20201106090359.pdf \n", + "\n", + " source license level \\\n", + "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "\n", + " category_en category_original_lang original_question_num \\\n", + "0 biology உயிரியல் 10 \n", + "1 biology உயிரியல் 11 \n", + "2 biology உயிரியல் 12 \n", + "3 biology உயிரியல் 13 \n", + "4 biology உயிரியல் 14 \n", + "\n", + " question \\\n", + "0 சரியற்ற கூற்றை கண்டறிக. \n", + "1 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n", + "2 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n", + "3 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n", + "4 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n", + "\n", + " options answer \n", + "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... 2 \n", + "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... 1 \n", + "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... 1 \n", + "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... 4 \n", + "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... 4 " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "afed88ff-1bb6-4157-934e-e72a40538a1a", + "metadata": {}, + "source": [ + "#### Save processed file" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_json(output_file_name+'.json', orient='records')\n", + "df_.to_csv(output_file_name+'.tsv',sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "160" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93", + "metadata": {}, + "outputs": [], + "source": [ + "with open('/')" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "470" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "160+161+149" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a98c8472-e0a2-487f-bab5-89eb7c80af0a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/misc/NEET/Readme.md b/misc/NEET/Readme.md new file mode 100644 index 0000000..4c31a51 --- /dev/null +++ b/misc/NEET/Readme.md @@ -0,0 +1,13 @@ +Contains code for processing NEET question papers + +Source : https://www.nta.ac.in/ + +**Files** +- Generator_Template.ipynb : Basic template for pipeline to process a NEET question paper +- generator_codes : Contains examples of previous uses of the Generator_Template notebook for specific question papers + + + +**For generating answer key JSON files** +Answer key maps can be easily extracted from Claude by passing either the pdf or an image of the answer key along with the following prompt: +The following is an answer key for an exam. The pdf consists of question number followed by correct option (which is a number between 1 and 4). These are spread across 4 columns. Process the answer key and output it in a JSON file with question number as the key and correct answer as the value \ No newline at end of file diff --git a/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb b/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb new file mode 100644 index 0000000..aa320bc --- /dev/null +++ b/misc/NEET/generator_codes/Generator3_2020_Gujarati.ipynb @@ -0,0 +1,1029 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/roshansk/Documents/GitHub/Global_Exams'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import base64\n", + "from openai import OpenAI\n", + "from anthropic import Anthropic\n", + "from pdf2image import convert_from_path\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import json\n", + "\n", + "import os\n", + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320b7be4-6a54-4169-8666-0a2c56359109", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " \"\"\"\n", + " Function to encode the image\n", + " \"\"\"\n", + " with open(image_path, \"rb\") as image_file:\n", + " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + "\n", + "\n", + "def get_message_list(base64_string, prompt):\n", + "\n", + " message_list = [\n", + " {\n", + " \"role\": 'user',\n", + " \"content\": [\n", + " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n", + " {\"type\": \"text\", \"text\": prompt}\n", + " ]\n", + " }\n", + " ]\n", + "\n", + " return message_list" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae", + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "\n", + "client = Anthropic()\n", + "\n", + "# MODEL_NAME = \"claude-3-opus-20240229\"\n", + "\n", + "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8", + "metadata": {}, + "outputs": [], + "source": [ + "language = 'hindi'\n", + "\n", + "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Gujarati_F6.pdf'\n", + "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_F6_Answer_Key.json'\n", + "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Gujarati_F6'\n", + "\n", + "\n", + "lang_code = 'gu'\n", + "country = 'India'\n", + "file_name = 'Paper_20201106083723.pdf'\n", + "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106083723.pdf'\n", + "license = 'open'\n", + "level = 'University'\n", + "\n", + "output_file_name = 'NEET_2020_Gujarati_processed'\n", + "\n", + "pages_to_include=[26,45]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981", + "metadata": {}, + "outputs": [], + "source": [ + "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "# \"\"\"\n", + "\n", + "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n", + "# \"\"\"\n", + "\n", + "\n", + "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n", + "# \"\"\"\n", + "\n", + "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Gujarati.\n", + "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "The json value should have the following keys : \n", + "- number : The number of the question\n", + "- question : The actual text of the question\n", + "- options : A list containing all 4 options for the question\n", + "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7", + "metadata": {}, + "source": [ + "## Creating Image Files" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(image_folder):\n", + " os.makedirs(image_folder)\n", + "\n", + "pages = convert_from_path(pdf_file)\n", + "\n", + "start_page, end_page = pages_to_include \n", + "count = 0\n", + "for i in range(start_page-1, end_page):\n", + " filename = \"page_\"+str(i)+'.jpg'\n", + " pages[i].save(Path(image_folder) / filename)\n", + " count += 1\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"{count} images created\")" + ] + }, + { + "cell_type": "markdown", + "id": "dcbe3632-fee8-473c-a206-c417f638484b", + "metadata": {}, + "source": [ + "## Extracting questions from Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1b4d55cc-c997-4614-8405-9b48154f8c80", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|██▏ | 1/20 [00:29<09:23, 29.63s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_44.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 10%|████▍ | 2/20 [00:55<08:15, 27.52s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_40.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 15%|██████▌ | 3/20 [01:16<06:53, 24.32s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_41.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|████████▊ | 4/20 [01:41<06:36, 24.77s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_43.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 25%|███████████ | 5/20 [02:12<06:45, 27.06s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_42.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 30%|█████████████▏ | 6/20 [02:43<06:37, 28.39s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_31.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 35%|███████████████▍ | 7/20 [03:01<05:23, 24.88s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_25.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 40%|█████████████████▌ | 8/20 [03:31<05:18, 26.56s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_30.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 45%|███████████████████▊ | 9/20 [03:51<04:28, 24.39s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_26.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████████████████████▌ | 10/20 [04:22<04:25, 26.50s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_32.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 55%|███████████████████████▋ | 11/20 [04:45<03:48, 25.36s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_33.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 60%|█████████████████████████▊ | 12/20 [05:18<03:43, 27.93s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_27.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 65%|███████████████████████████▉ | 13/20 [05:39<03:00, 25.75s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_37.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 70%|██████████████████████████████ | 14/20 [06:13<02:49, 28.32s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_36.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 75%|████████████████████████████████▎ | 15/20 [06:41<02:20, 28.01s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_34.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 80%|██████████████████████████████████▍ | 16/20 [07:13<01:57, 29.31s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_35.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 85%|████████████████████████████████████▌ | 17/20 [07:42<01:27, 29.25s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_38.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 90%|██████████████████████████████████████▋ | 18/20 [08:10<00:57, 28.73s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_39.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|████████████████████████████████████████▊ | 19/20 [08:40<00:29, 29.05s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_29.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 20/20 [08:57<00:00, 26.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_28.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "imageList = os.listdir(image_folder)\n", + "\n", + "json_output = {}\n", + "text_output = {}\n", + "\n", + "for i in tqdm(range(len(imageList))):\n", + " image_path = Path(image_folder) / imageList[i]\n", + " base64_string = encode_image(image_path)\n", + "\n", + " message_list = get_message_list(base64_string, prompt) \n", + "\n", + " response = client.messages.create(\n", + " model=MODEL_NAME,\n", + " max_tokens=4096,\n", + " messages=message_list,\n", + " temperature = 0.0,\n", + " top_p = 1\n", + " \n", + " )\n", + "\n", + " try:\n", + " out = eval(response.content[0].text)\n", + "\n", + " json_output[i] = out \n", + " print(f\"{imageList[i]} added to json\")\n", + " except:\n", + " text_output[i] = response.content[0].text\n", + " print(f\"{imageList[i]} added to text\")\n", + " \n", + "json_backup = json_output.copy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "a308436b-d478-48e9-b476-a8c50878f912", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total no of samples : 180\n", + "Total no of samples without images: 161\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_question_numquestionoptionsimage
0170એક અવરોધક માટે વર્ણ-સંકેત નીચે આપેલ છે :[4.7 kΩ, 5%, 470 Ω, 5%, 470 kΩ, 5%, 47 kΩ, 10%]True
1171r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને...[10.0 g, 20.0 g, 2.5 g, 5.0 g]False
2172એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ...[0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³]False
3173અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ...[ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ...False
4174એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___...[5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT]False
\n", + "
" + ], + "text/plain": [ + " original_question_num question \\\n", + "0 170 એક અવરોધક માટે વર્ણ-સંકેત નીચે આપેલ છે : \n", + "1 171 r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... \n", + "2 172 એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... \n", + "3 173 અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... \n", + "4 174 એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... \n", + "\n", + " options image \n", + "0 [4.7 kΩ, 5%, 470 Ω, 5%, 470 kΩ, 5%, 47 kΩ, 10%] True \n", + "1 [10.0 g, 20.0 g, 2.5 g, 5.0 g] False \n", + "2 [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] False \n", + "3 [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... False \n", + "4 [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] False " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = []\n", + "for key in json_output.keys():\n", + " output += json_output[key]\n", + "\n", + "\n", + "df = pd.DataFrame(output)\n", + "df.columns = ['original_question_num','question','options','image']\n", + "\n", + "print(f\"Total no of samples : {len(df)}\")\n", + "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea", + "metadata": {}, + "source": [ + "#### Removing samples with images/tables" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2c118ef0-1eda-453a-ab8a-2db286928528", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.image == False]" + ] + }, + { + "cell_type": "markdown", + "id": "5066c956-a46b-44a7-973f-ba3589a5a028", + "metadata": {}, + "source": [ + "#### Attaching Answer Key" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9", + "metadata": {}, + "outputs": [], + "source": [ + "with open(answer_file) as f:\n", + " answer_key = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n", + "\n", + "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))" + ] + }, + { + "cell_type": "markdown", + "id": "32a86338-be74-4a89-bb70-2821910f43ee", + "metadata": {}, + "source": [ + "#### Assigning categories" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19", + "metadata": {}, + "outputs": [], + "source": [ + "category_map = {'biology': 'जीवविज्ञान' ,'chemistry':'रसायन विज्ञान','physics':'भौतिक विज्ञान'}\n", + "\n", + "question_category_map = {}\n", + "for i in range(1,46):\n", + " question_category_map[i] = 'physics'\n", + "\n", + "for i in range(46,91):\n", + " question_category_map[i] = 'chemistry'\n", + "\n", + "for i in range(91,181):\n", + " question_category_map[i] = 'biology'" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "bda39801-e316-48ee-9c68-c7178935cd59", + "metadata": {}, + "outputs": [], + "source": [ + "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n", + "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])" + ] + }, + { + "cell_type": "markdown", + "id": "fd7aaff4-1f19-47fd-a52b-105870444262", + "metadata": {}, + "source": [ + "#### Assigning other metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891", + "metadata": {}, + "outputs": [], + "source": [ + "df['language'] = lang_code\n", + "df['country'] = country\n", + "df['file_name'] = file_name\n", + "df['source'] = source\n", + "df['license'] = license\n", + "df['level'] = level\n", + "\n", + "\n", + "df_ = df.copy()\n", + "df.drop('image', axis = 1, inplace = True)\n", + "\n", + "cols = ['language',\n", + " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n", + " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n", + "\n", + "df = df[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
languagecountryfile_namesourcelicenselevelcategory_encategory_original_langoriginal_question_numquestionoptionsanswer
1guIndiaPaper_20201106083723.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवविज्ञान171r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને...[10.0 g, 20.0 g, 2.5 g, 5.0 g]1
2guIndiaPaper_20201106083723.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवविज्ञान172એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ...[0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³]4
3guIndiaPaper_20201106083723.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवविज्ञान173અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ...[ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ...2
4guIndiaPaper_20201106083723.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवविज्ञान174એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___...[5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT]4
5guIndiaPaper_20201106083723.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवविज्ञान17520 cm² ક્ષેત્રફળ ધરાવતી એક અખરાવતિત સપાટી પર 2...[24×10³ J, 48×10³ J, 10×10³ J, 12×10³ J]1
\n", + "
" + ], + "text/plain": [ + " language country file_name \\\n", + "1 gu India Paper_20201106083723.pdf \n", + "2 gu India Paper_20201106083723.pdf \n", + "3 gu India Paper_20201106083723.pdf \n", + "4 gu India Paper_20201106083723.pdf \n", + "5 gu India Paper_20201106083723.pdf \n", + "\n", + " source license level \\\n", + "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "5 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "\n", + " category_en category_original_lang original_question_num \\\n", + "1 biology जीवविज्ञान 171 \n", + "2 biology जीवविज्ञान 172 \n", + "3 biology जीवविज्ञान 173 \n", + "4 biology जीवविज्ञान 174 \n", + "5 biology जीवविज्ञान 175 \n", + "\n", + " question \\\n", + "1 r-ત્રિજ્યા ધરાવતી એક કેશનળી ટ્યુબ (કેપિલરી) ને... \n", + "2 એક વાયુકોષમાં 249 kPa દબાણે અને 27°C તાપમાને હ... \n", + "3 અવરોધના ગુણ તાપમાન ગુણાંક ધરાવતા હોય તેવા 'ધન ... \n", + "4 એક એકપરમાણવીય વાયુની સરેરાશ ઉષ્મા ઊર્જા છે ___... \n", + "5 20 cm² ક્ષેત્રફળ ધરાવતી એક અખરાવતિત સપાટી પર 2... \n", + "\n", + " options answer \n", + "1 [10.0 g, 20.0 g, 2.5 g, 5.0 g] 1 \n", + "2 [0.1 kg/m³, 0.02 kg/m³, 0.5 kg/m³, 0.2 kg/m³] 4 \n", + "3 [ફક્ત અર્ધવાહકો, અવાહકો અને અર્ધવાહકો, ધાતુઓ, ... 2 \n", + "4 [5/2 kBT, 7/2 kBT, 1/2 kBT, 3/2 kBT] 4 \n", + "5 [24×10³ J, 48×10³ J, 10×10³ J, 12×10³ J] 1 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "afed88ff-1bb6-4157-934e-e72a40538a1a", + "metadata": {}, + "source": [ + "#### Save processed file" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_json(output_file_name+'.json', orient='records')\n", + "df_.to_csv(output_file_name+'.tsv',sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "161" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb b/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb new file mode 100644 index 0000000..0b2075b --- /dev/null +++ b/misc/NEET/generator_codes/Generator3_2020_Marathi.ipynb @@ -0,0 +1,1046 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/roshansk/Documents/GitHub/Global_Exams'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import base64\n", + "from openai import OpenAI\n", + "from anthropic import Anthropic\n", + "from pdf2image import convert_from_path\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import json\n", + "\n", + "import os\n", + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320b7be4-6a54-4169-8666-0a2c56359109", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " \"\"\"\n", + " Function to encode the image\n", + " \"\"\"\n", + " with open(image_path, \"rb\") as image_file:\n", + " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + "\n", + "\n", + "def get_message_list(base64_string, prompt):\n", + "\n", + " message_list = [\n", + " {\n", + " \"role\": 'user',\n", + " \"content\": [\n", + " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n", + " {\"type\": \"text\", \"text\": prompt}\n", + " ]\n", + " }\n", + " ]\n", + "\n", + " return message_list" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae", + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "\n", + "client = Anthropic()\n", + "\n", + "# MODEL_NAME = \"claude-3-opus-20240229\"\n", + "\n", + "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Marathi_G3.pdf'\n", + "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G3_Answer_Key.json'\n", + "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Marathi_G3'\n", + "\n", + "\n", + "lang_code = 'mr'\n", + "country = 'India'\n", + "file_name = 'Paper_20201106084438.pdf'\n", + "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106084438.pdf'\n", + "license = 'open'\n", + "level = 'University'\n", + "\n", + "output_file_name = 'NEET_2020_Marathi_processed'\n", + "\n", + "pages_to_include=[2,21]" + ] + }, + { + "cell_type": "markdown", + "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18", + "metadata": {}, + "source": [ + "## Prompt (User input required. Change the prompt value to account for language)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981", + "metadata": {}, + "outputs": [], + "source": [ + "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "# \"\"\"\n", + "\n", + "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n", + "# \"\"\"\n", + "\n", + "\n", + "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n", + "# \"\"\"\n", + "\n", + "\n", + "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Marathi.\n", + "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "The json value should have the following keys : \n", + "- number : The number of the question\n", + "- question : The actual text of the question\n", + "- options : A list containing all 4 options for the question\n", + "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7", + "metadata": {}, + "source": [ + "## Creating Image Files" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(image_folder):\n", + " os.makedirs(image_folder)\n", + "\n", + "pages = convert_from_path(pdf_file)\n", + "\n", + "start_page, end_page = pages_to_include \n", + "count = 0\n", + "for i in range(start_page-1, end_page):\n", + " filename = \"page_\"+str(i)+'.jpg'\n", + " pages[i].save(Path(image_folder) / filename)\n", + " count += 1\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20 images created\n" + ] + } + ], + "source": [ + "print(f\"{count} images created\")" + ] + }, + { + "cell_type": "markdown", + "id": "dcbe3632-fee8-473c-a206-c417f638484b", + "metadata": {}, + "source": [ + "## Extracting questions from Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "1b4d55cc-c997-4614-8405-9b48154f8c80", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|██▏ | 1/20 [00:31<09:49, 31.00s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_2.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 10%|████▍ | 2/20 [00:57<08:35, 28.63s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_3.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 15%|██████▌ | 3/20 [01:24<07:50, 27.68s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_1.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 20%|████████▊ | 4/20 [01:55<07:41, 28.83s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_4.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 25%|███████████ | 5/20 [02:43<08:56, 35.74s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_5.jpg added to text\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 30%|█████████████▏ | 6/20 [03:04<07:11, 30.80s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_7.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 35%|███████████████▍ | 7/20 [03:30<06:21, 29.33s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_6.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 40%|█████████████████▌ | 8/20 [03:56<05:38, 28.18s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_19.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 45%|███████████████████▊ | 9/20 [04:21<04:59, 27.25s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_18.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████████████████████▌ | 10/20 [04:29<03:32, 21.27s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_20.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 55%|███████████████████████▋ | 11/20 [04:50<03:11, 21.24s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_10.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 60%|█████████████████████████▊ | 12/20 [09:07<12:23, 92.89s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_11.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 65%|███████████████████████████▉ | 13/20 [09:37<08:37, 73.95s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_13.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 70%|██████████████████████████████ | 14/20 [09:59<05:48, 58.14s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_12.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 75%|████████████████████████████████▎ | 15/20 [10:30<04:09, 49.99s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_16.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 80%|██████████████████████████████████▍ | 16/20 [10:50<02:43, 40.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_17.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 85%|████████████████████████████████████▌ | 17/20 [11:11<01:44, 34.98s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_15.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 90%|██████████████████████████████████████▋ | 18/20 [11:36<01:03, 31.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_14.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|████████████████████████████████████████▊ | 19/20 [11:53<00:27, 27.59s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_8.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 20/20 [12:12<00:00, 36.60s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_9.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "imageList = os.listdir(image_folder)\n", + "\n", + "json_output = {}\n", + "text_output = {}\n", + "\n", + "for i in tqdm(range(len(imageList))):\n", + " image_path = Path(image_folder) / imageList[i]\n", + " base64_string = encode_image(image_path)\n", + "\n", + " message_list = get_message_list(base64_string, prompt) \n", + "\n", + " response = client.messages.create(\n", + " model=MODEL_NAME,\n", + " max_tokens=4096,\n", + " messages=message_list,\n", + " temperature = 0.0,\n", + " top_p = 1\n", + " \n", + " )\n", + "\n", + " try:\n", + " out = eval(response.content[0].text)\n", + "\n", + " json_output[i] = out \n", + " print(f\"{imageList[i]} added to json\")\n", + " except:\n", + " text_output[i] = response.content[0].text\n", + " print(f\"{imageList[i]} added to text\")\n", + " \n", + "json_backup = json_output.copy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "a308436b-d478-48e9-b476-a8c50878f912", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total no of samples : 171\n", + "Total no of samples without images: 149\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_question_numquestionoptionsimage
010वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक...[परागकोशातील परागकण, अंकुरित झालेल्या परागकण व...False
111प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण...[स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट...False
212अयोग्य विधान ओळखा :[रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां...False
313पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____...[समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि...False
414जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी...[एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत...False
\n", + "
" + ], + "text/plain": [ + " original_question_num question \\\n", + "0 10 वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... \n", + "1 11 प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... \n", + "2 12 अयोग्य विधान ओळखा : \n", + "3 13 पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... \n", + "4 14 जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... \n", + "\n", + " options image \n", + "0 [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... False \n", + "1 [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... False \n", + "2 [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... False \n", + "3 [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... False \n", + "4 [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... False " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = []\n", + "for key in json_output.keys():\n", + " output += json_output[key]\n", + "\n", + "\n", + "df = pd.DataFrame(output)\n", + "df.columns = ['original_question_num','question','options','image']\n", + "\n", + "print(f\"Total no of samples : {len(df)}\")\n", + "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea", + "metadata": {}, + "source": [ + "#### Removing samples with images/tables" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "2c118ef0-1eda-453a-ab8a-2db286928528", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.image == False]" + ] + }, + { + "cell_type": "markdown", + "id": "5066c956-a46b-44a7-973f-ba3589a5a028", + "metadata": {}, + "source": [ + "#### Attaching Answer Key" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9", + "metadata": {}, + "outputs": [], + "source": [ + "with open(answer_file) as f:\n", + " answer_key = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n", + "\n", + "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))" + ] + }, + { + "cell_type": "markdown", + "id": "32a86338-be74-4a89-bb70-2821910f43ee", + "metadata": {}, + "source": [ + "#### Assigning categories" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19", + "metadata": {}, + "outputs": [], + "source": [ + "category_map = {'biology': 'जीवशास्त्र' ,'chemistry':'रसायनशास्त्र','physics':'भौतिकशास्त्र'}\n", + "\n", + "question_category_map = {}\n", + "for i in range(1,91):\n", + " question_category_map[i] = 'biology'\n", + "\n", + "for i in range(91,136):\n", + " question_category_map[i] = 'physics'\n", + "\n", + "for i in range(136,181):\n", + " question_category_map[i] = 'chemistry'" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "bda39801-e316-48ee-9c68-c7178935cd59", + "metadata": {}, + "outputs": [], + "source": [ + "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n", + "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])" + ] + }, + { + "cell_type": "markdown", + "id": "fd7aaff4-1f19-47fd-a52b-105870444262", + "metadata": {}, + "source": [ + "#### Assigning other metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891", + "metadata": {}, + "outputs": [], + "source": [ + "df['language'] = lang_code\n", + "df['country'] = country\n", + "df['file_name'] = file_name\n", + "df['source'] = source\n", + "df['license'] = license\n", + "df['level'] = level\n", + "\n", + "\n", + "df_ = df.copy()\n", + "df.drop('image', axis = 1, inplace = True)\n", + "\n", + "cols = ['language',\n", + " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n", + " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n", + "\n", + "df = df[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
languagecountryfile_namesourcelicenselevelcategory_encategory_original_langoriginal_question_numquestionoptionsanswer
0mrIndiaPaper_20201106084438.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवशास्त्र10वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक...[परागकोशातील परागकण, अंकुरित झालेल्या परागकण व...3
1mrIndiaPaper_20201106084438.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवशास्त्र11प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण...[स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट...1
2mrIndiaPaper_20201106084438.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवशास्त्र12अयोग्य विधान ओळखा :[रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां...2
3mrIndiaPaper_20201106084438.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवशास्त्र13पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____...[समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि...1
4mrIndiaPaper_20201106084438.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyजीवशास्त्र14जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी...[एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत...2
\n", + "
" + ], + "text/plain": [ + " language country file_name \\\n", + "0 mr India Paper_20201106084438.pdf \n", + "1 mr India Paper_20201106084438.pdf \n", + "2 mr India Paper_20201106084438.pdf \n", + "3 mr India Paper_20201106084438.pdf \n", + "4 mr India Paper_20201106084438.pdf \n", + "\n", + " source license level \\\n", + "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "\n", + " category_en category_original_lang original_question_num \\\n", + "0 biology जीवशास्त्र 10 \n", + "1 biology जीवशास्त्र 11 \n", + "2 biology जीवशास्त्र 12 \n", + "3 biology जीवशास्त्र 13 \n", + "4 biology जीवशास्त्र 14 \n", + "\n", + " question \\\n", + "0 वनस्पतीत खालीलपैकी कोणता शरीरभाग दोन पिढ्या एक... \n", + "1 प्लाझ्मोडिअमची मानवी शरीरात प्रवेशणारी संक्रमण... \n", + "2 अयोग्य विधान ओळखा : \n", + "3 पेंसीनचे आणि अॅल्गिननचे पिसपूर ही उदाहरणे ____... \n", + "4 जीन 'I' जो ABO रक्तगट नियंत्रण करतो त्याच्याशी... \n", + "\n", + " options answer \n", + "0 [परागकोशातील परागकण, अंकुरित झालेल्या परागकण व... 3 \n", + "1 [स्पोरोझोइट्स, मादी युग्मकेशी (मादी गॅमिटोसाइट... 1 \n", + "2 [रक्तकाष्ठ पाणी व खनिजांचे वहन मूळापासून पानां... 2 \n", + "3 [समकेंद्री उष्णती, ओलीगोसॅकरायडला, नैसर्गिक नि... 1 \n", + "4 [एका व्यक्तीमध्ये तीन पैकी दोन युग्मविकल्प असत... 2 " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "afed88ff-1bb6-4157-934e-e72a40538a1a", + "metadata": {}, + "source": [ + "#### Save processed file" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_json(output_file_name+'.json', orient='records')\n", + "df_.to_csv(output_file_name+'.tsv',sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "161" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb b/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb new file mode 100644 index 0000000..5226b40 --- /dev/null +++ b/misc/NEET/generator_codes/Generator3_2020_Tamil.ipynb @@ -0,0 +1,1074 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b2729c79-56a6-41da-a49f-c1cdf92532d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/roshansk/Documents/GitHub/Global_Exams'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "import base64\n", + "from openai import OpenAI\n", + "from anthropic import Anthropic\n", + "from pdf2image import convert_from_path\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import json\n", + "\n", + "import os\n", + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320b7be4-6a54-4169-8666-0a2c56359109", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " \"\"\"\n", + " Function to encode the image\n", + " \"\"\"\n", + " with open(image_path, \"rb\") as image_file:\n", + " return base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + "\n", + "\n", + "def get_message_list(base64_string, prompt):\n", + "\n", + " message_list = [\n", + " {\n", + " \"role\": 'user',\n", + " \"content\": [\n", + " {\"type\": \"image\", \"source\": {\"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": base64_string}},\n", + " {\"type\": \"text\", \"text\": prompt}\n", + " ]\n", + " }\n", + " ]\n", + "\n", + " return message_list" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46cdee87-a3de-4796-8e26-a2a1a2c343ae", + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "\n", + "client = Anthropic()\n", + "\n", + "# MODEL_NAME = \"claude-3-opus-20240229\"\n", + "\n", + "MODEL_NAME = \"claude-3-5-sonnet-20240620\"\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "438dd01b-1448-4bb0-94e6-ed1a3f73b9d8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "pdf_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_Tamil_G4.pdf'\n", + "answer_file = '/Users/roshansk/Documents/GitHub/Global_Exams/data/2020/NEET_2020_G4_Answer_Key.json'\n", + "image_folder = '/Users/roshansk/Documents/GitHub/Global_Exams/images/NEET_2020_Tamil_G4'\n", + "\n", + "\n", + "lang_code = 'ta'\n", + "country = 'India'\n", + "file_name = 'Paper_20201106090359.pdf'\n", + "source = 'https://www.nta.ac.in/Download/ExamPaper/Paper_20201106090359.pdf'\n", + "license = 'open'\n", + "level = 'University'\n", + "\n", + "output_file_name = 'NEET_2020_Tamil_processed'\n", + "\n", + "pages_to_include=[2,23]" + ] + }, + { + "cell_type": "markdown", + "id": "3b26bc26-49bf-4687-a5a9-03a6832ecc18", + "metadata": {}, + "source": [ + "## Prompt (User input required. Change the prompt value to account for language)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "88ec45bd-1972-4d96-bb5d-a67b3db11981", + "metadata": {}, + "outputs": [], + "source": [ + "# prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of {lang}.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "# \"\"\"\n", + "\n", + "# prompt_hindi = \"\"\"You are given a pdf containing Hindi and English questions. Extract the Hindi multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python directly using eval(response.text)\n", + "# The questions are present in two columns with Hindi questions on the left column and English questions on the right column. Extract only the Hindi questions from the left column.\n", + "# \"\"\"\n", + "\n", + "\n", + "# prompt = \"\"\"You are given a pdf containing Gujarati and English questions. Extract the Gujarati multiple choice questions along with the options present in the image.\n", + "# Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "# The json value should have the following keys : \n", + "# - number : The number of the question\n", + "# - question : The actual text of the question\n", + "# - options : A list containing all 4 options for the question\n", + "# - image : output True if there is an image associated with the either the question or answer and the student is supposed to use an image to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "# Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "# The questions are present in two columns with Gujarati questions on the right side and English questions on the left. Extract only the Gujarati questions from the right column.\n", + "# \"\"\"\n", + "\n", + "\n", + "prompt = \"\"\"Extract the multiple choice questions along with the options present in the image. The text is mostly in the language of Tamil.\n", + "Return the extracted the questions as a list of json values, with each json value corresponding to a single question. \n", + "The json value should have the following keys : \n", + "- number : The number of the question\n", + "- question : The actual text of the question\n", + "- options : A list containing all 4 options for the question\n", + "- image : output True if there is an image or table associated with the either the question or answer and the student is supposed to use an image/table to answer the question. Output False, otherwise. Ensure that True and False have their first letters capitalized.\n", + "\n", + "Output the list alone and no other supporting text in your response. Ensure that the output can be converted into a python dict directly using eval(response.text)\n", + "The questions are present in 2 columns. Make sure to extract questions from both columns and not just one\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "0333fbaf-b80a-48e3-9826-35ee6fb1b9c7", + "metadata": {}, + "source": [ + "## Creating Image Files" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "9a59c2b0-0b87-4e37-8ec5-3b89f9e3ae44", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(image_folder):\n", + " os.makedirs(image_folder)\n", + "\n", + "pages = convert_from_path(pdf_file)\n", + "\n", + "start_page, end_page = pages_to_include \n", + "count = 0\n", + "for i in range(start_page-1, end_page):\n", + " filename = \"page_\"+str(i)+'.jpg'\n", + " pages[i].save(Path(image_folder) / filename)\n", + " count += 1\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "7c323568-4ab7-4f89-968a-fdb3f1a599ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 images created\n" + ] + } + ], + "source": [ + "print(f\"{count} images created\")" + ] + }, + { + "cell_type": "markdown", + "id": "dcbe3632-fee8-473c-a206-c417f638484b", + "metadata": {}, + "source": [ + "## Extracting questions from Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "1b4d55cc-c997-4614-8405-9b48154f8c80", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|██ | 1/22 [00:31<11:00, 31.45s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_2.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 9%|████ | 2/22 [00:59<09:46, 29.33s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_3.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 14%|██████ | 3/22 [01:21<08:16, 26.14s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_1.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 18%|████████ | 4/22 [01:48<07:55, 26.43s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_4.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 23%|██████████ | 5/22 [02:14<07:25, 26.22s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_5.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 27%|███████████▋ | 6/22 [06:41<28:49, 108.09s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_7.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 32%|██████████████ | 7/22 [06:59<19:40, 78.68s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_6.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 36%|████████████████ | 8/22 [07:26<14:30, 62.17s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_19.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 41%|██████████████████ | 9/22 [07:53<11:06, 51.27s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_18.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 45%|███████████████████▌ | 10/22 [08:04<07:44, 38.67s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_22.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████████████████████▌ | 11/22 [08:29<06:19, 34.51s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_20.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 55%|██████████████████████▉ | 12/22 [14:32<22:25, 134.51s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_21.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 59%|████████████████████████▊ | 13/22 [15:04<15:30, 103.41s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_10.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 64%|███████████████████████████▎ | 14/22 [15:27<10:33, 79.20s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_11.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 68%|█████████████████████████████▎ | 15/22 [15:52<07:20, 62.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_13.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 73%|███████████████████████████████▎ | 16/22 [16:18<05:11, 51.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_12.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 77%|█████████████████████████████████▏ | 17/22 [16:44<03:39, 43.84s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_16.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 82%|██████████████████████████████████▎ | 18/22 [23:51<10:36, 159.11s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_17.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 86%|████████████████████████████████████▎ | 19/22 [24:18<05:58, 119.41s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_15.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 91%|███████████████████████████████████████ | 20/22 [24:46<03:03, 91.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_14.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 95%|█████████████████████████████████████████ | 21/22 [25:08<01:10, 70.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_8.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 22/22 [25:31<00:00, 69.61s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_9.jpg added to json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "imageList = os.listdir(image_folder)\n", + "\n", + "json_output = {}\n", + "text_output = {}\n", + "\n", + "for i in tqdm(range(len(imageList))):\n", + " image_path = Path(image_folder) / imageList[i]\n", + " base64_string = encode_image(image_path)\n", + "\n", + " message_list = get_message_list(base64_string, prompt) \n", + "\n", + " response = client.messages.create(\n", + " model=MODEL_NAME,\n", + " max_tokens=4096,\n", + " messages=message_list,\n", + " temperature = 0.0,\n", + " top_p = 1\n", + " \n", + " )\n", + "\n", + " try:\n", + " out = eval(response.content[0].text)\n", + "\n", + " json_output[i] = out \n", + " print(f\"{imageList[i]} added to json\")\n", + " except:\n", + " text_output[i] = response.content[0].text\n", + " print(f\"{imageList[i]} added to text\")\n", + " \n", + "json_backup = json_output.copy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "a308436b-d478-48e9-b476-a8c50878f912", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total no of samples : 180\n", + "Total no of samples without images: 160\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_question_numquestionoptionsimage
010சரியற்ற கூற்றை கண்டறிக.[சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந...False
111உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர...[துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க...False
212அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா...[அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி...False
313வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ?[ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந...False
414S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ...[800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ...False
\n", + "
" + ], + "text/plain": [ + " original_question_num question \\\n", + "0 10 சரியற்ற கூற்றை கண்டறிக. \n", + "1 11 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n", + "2 12 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n", + "3 13 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n", + "4 14 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n", + "\n", + " options image \n", + "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... False \n", + "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... False \n", + "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... False \n", + "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... False \n", + "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... False " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = []\n", + "for key in json_output.keys():\n", + " output += json_output[key]\n", + "\n", + "\n", + "df = pd.DataFrame(output)\n", + "df.columns = ['original_question_num','question','options','image']\n", + "\n", + "print(f\"Total no of samples : {len(df)}\")\n", + "print(f\"Total no of samples without images: {len(df[df.image==False])}\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "514615e2-2ec2-435f-b8a9-fb06d4a6dfea", + "metadata": {}, + "source": [ + "#### Removing samples with images/tables" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "2c118ef0-1eda-453a-ab8a-2db286928528", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.image == False]" + ] + }, + { + "cell_type": "markdown", + "id": "5066c956-a46b-44a7-973f-ba3589a5a028", + "metadata": {}, + "source": [ + "#### Attaching Answer Key" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "faafde12-b46d-44dd-ac88-3b6df0cbaed9", + "metadata": {}, + "outputs": [], + "source": [ + "with open(answer_file) as f:\n", + " answer_key = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "7b7d48a9-9895-43d5-bd0d-9740b5a7e3bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df.original_question_num = df.original_question_num.apply(lambda x: x.replace(\".\",\"\")) #Cleaning\n", + "\n", + "df['answer'] = df.original_question_num.apply(lambda x : str(answer_key[x]))" + ] + }, + { + "cell_type": "markdown", + "id": "32a86338-be74-4a89-bb70-2821910f43ee", + "metadata": {}, + "source": [ + "#### Assigning categories" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "db3d9a00-3d5c-4225-9499-c4e7bc622e19", + "metadata": {}, + "outputs": [], + "source": [ + "category_map = {'biology': 'உயிரியல்' ,'chemistry':'வேதியியல்','physics':'இயற்பியல்'}\n", + "\n", + "question_category_map = {}\n", + "for i in range(1,91):\n", + " question_category_map[i] = 'biology'\n", + "\n", + "for i in range(91,136):\n", + " question_category_map[i] = 'physics'\n", + "\n", + "for i in range(136,181):\n", + " question_category_map[i] = 'chemistry'" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bda39801-e316-48ee-9c68-c7178935cd59", + "metadata": {}, + "outputs": [], + "source": [ + "df['category_en'] = df.original_question_num.apply(lambda x : question_category_map[int(x)])\n", + "df['category_original_lang'] = df.category_en.apply(lambda x : category_map[x])" + ] + }, + { + "cell_type": "markdown", + "id": "fd7aaff4-1f19-47fd-a52b-105870444262", + "metadata": {}, + "source": [ + "#### Assigning other metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "08b17366-5538-4ebb-b5a7-b69be1e2b891", + "metadata": {}, + "outputs": [], + "source": [ + "df['language'] = lang_code\n", + "df['country'] = country\n", + "df['file_name'] = file_name\n", + "df['source'] = source\n", + "df['license'] = license\n", + "df['level'] = level\n", + "\n", + "\n", + "df_ = df.copy()\n", + "df.drop('image', axis = 1, inplace = True)\n", + "\n", + "cols = ['language',\n", + " 'country', 'file_name', 'source', 'license', 'level', 'category_en',\n", + " 'category_original_lang', 'original_question_num', 'question', 'options', 'answer']\n", + "\n", + "df = df[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "43db4b31-cedc-42cb-a8e2-7e72f11c4790", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
languagecountryfile_namesourcelicenselevelcategory_encategory_original_langoriginal_question_numquestionoptionsanswer
0taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்10சரியற்ற கூற்றை கண்டறிக.[சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந...2
1taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்11உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர...[துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க...1
2taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்12அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா...[அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி...1
3taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்13வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ?[ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந...4
4taIndiaPaper_20201106090359.pdfhttps://www.nta.ac.in/Download/ExamPaper/Paper...openUniversitybiologyஉயிரியல்14S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ...[800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ...4
\n", + "
" + ], + "text/plain": [ + " language country file_name \\\n", + "0 ta India Paper_20201106090359.pdf \n", + "1 ta India Paper_20201106090359.pdf \n", + "2 ta India Paper_20201106090359.pdf \n", + "3 ta India Paper_20201106090359.pdf \n", + "4 ta India Paper_20201106090359.pdf \n", + "\n", + " source license level \\\n", + "0 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "1 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "2 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "3 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "4 https://www.nta.ac.in/Download/ExamPaper/Paper... open University \n", + "\n", + " category_en category_original_lang original_question_num \\\n", + "0 biology உயிரியல் 10 \n", + "1 biology உயிரியல் 11 \n", + "2 biology உயிரியல் 12 \n", + "3 biology உயிரியல் 13 \n", + "4 biology உயிரியல் 14 \n", + "\n", + " question \\\n", + "0 சரியற்ற கூற்றை கண்டறிக. \n", + "1 உணவு பாதையிலுள்ள கோப்பை வடிவ செல்கள் எவற்றிலிர... \n", + "2 அண்டார்டிகா பகுதியில் பனிக்கூடு ஏற்படுவது எதனா... \n", + "3 வளர்ச்சி நிலை அதிகமாக இருப்பது எப்போது ? \n", + "4 S.L. மில்லர் தன் சோதனைகளில் மூடிய குடுவையில் இ... \n", + "\n", + " options answer \n", + "0 [சாறுக் கட்டை நீர் மற்றும் தனிமங்களை வேரிலிருந... 2 \n", + "1 [துண்டுபடி எபிதீலிய செல்கள், காண்ட்ரோசைட்டு, க... 1 \n", + "2 [அதிக அளவிலான UV-B கதிர்வீச்சின் காரணமாக கருவி... 1 \n", + "3 [ஒடுக்கப் பருவம், முதிர்ந்து உதிர்தல், உறக்க ந... 4 \n", + "4 [800°C -ல் CH₃, H₂, NH₄ மற்றும் நீராவி, 600°C ... 4 " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "afed88ff-1bb6-4157-934e-e72a40538a1a", + "metadata": {}, + "source": [ + "#### Save processed file" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "9e9353c0-f945-4070-b7c1-cb1463907b9a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_json(output_file_name+'.json', orient='records')\n", + "df_.to_csv(output_file_name+'.tsv',sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "d955bca7-1c09-4826-9bf9-8c2da3175d02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "160" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "47ae5148-fdc5-43b1-8d12-772ef6e2ff93", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b21d6a38-ae4c-490e-baf5-abdc5253d87a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}