From 6ab0619143cd2c4a8ff2a8668cb646a5e30f203f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Sun, 3 Mar 2024 12:21:29 +0100 Subject: [PATCH] Add files via upload --- .../tutorial-query-intent-prediction.ipynb | 3636 +++++++++++++++++ 1 file changed, 3636 insertions(+) create mode 100644 tutorials/tutorial-query-intent-prediction.ipynb diff --git a/tutorials/tutorial-query-intent-prediction.ipynb b/tutorials/tutorial-query-intent-prediction.ipynb new file mode 100644 index 0000000..5f51fdf --- /dev/null +++ b/tutorials/tutorial-query-intent-prediction.ipynb @@ -0,0 +1,3636 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jpfycYr7C4DE", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e5d8b7a4-d91e-4995-9fef-f77b2b570625" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting tira\n", + " Downloading tira-0.0.105-py3-none-any.whl (48 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m799.6 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting ir_datasets\n", + " Downloading ir_datasets-0.5.6-py3-none-any.whl (335 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m335.2/335.2 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting python-terrier\n", + " Downloading python-terrier-0.10.0.tar.gz (107 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.6/107.6 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n", + "Collecting docker==6.*,>=6.0.0 (from tira)\n", + " Downloading docker-6.1.3-py3-none-any.whl (148 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m148.1/148.1 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (1.5.3)\n", + "Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.0.7)\n", + "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2024.2.2)\n", + "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir_datasets) (4.12.3)\n", + "Collecting inscriptis>=2.2.0 (from ir_datasets)\n", + " Downloading inscriptis-2.4.0.1-py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir_datasets) (4.9.4)\n", + "Requirement already satisfied: numpy>=1.18.1 
in /usr/local/lib/python3.10/dist-packages (from ir_datasets) (1.25.2)\n", + "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir_datasets) (6.0.1)\n", + "Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.10/dist-packages (from ir_datasets) (4.66.2)\n", + "Collecting trec-car-tools>=2.5.4 (from ir_datasets)\n", + " Downloading trec_car_tools-2.6-py3-none-any.whl (8.4 kB)\n", + "Collecting lz4>=3.1.10 (from ir_datasets)\n", + " Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting warc3-wet>=0.2.3 (from ir_datasets)\n", + " Downloading warc3_wet-0.2.3-py3-none-any.whl (13 kB)\n", + "Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)\n", + " Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting zlib-state>=0.1.3 (from ir_datasets)\n", + " Downloading zlib-state-0.1.6.tar.gz (9.5 kB)\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting ijson>=3.1.3 (from ir_datasets)\n", + " Downloading ijson-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (111 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.8/111.8 kB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pyautocorpus>=0.1.1 (from ir_datasets)\n", + " Downloading pyautocorpus-0.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (379 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m379.9/379.9 kB\u001b[0m \u001b[31m39.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting unlzw3>=0.2.1 (from ir_datasets)\n", + " Downloading unlzw3-0.2.2-py3-none-any.whl (6.1 kB)\n", + "Collecting wget (from python-terrier)\n", + " Downloading wget-3.2.zip (10 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting pyjnius>=1.4.2 (from python-terrier)\n", + " Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting matchpy (from python-terrier)\n", + " Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.6/69.6 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.2)\n", + "Collecting deprecated (from python-terrier)\n", + " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting chest (from python-terrier)\n", + " Downloading chest-0.2.3.tar.gz (9.6 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", + "Collecting nptyping==1.4.4 (from python-terrier)\n", + " Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: more_itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.3)\n", + "Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.1)\n", + "Collecting ir_measures>=0.3.1 (from python-terrier)\n", + " Downloading ir_measures-0.3.3.tar.gz (48 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.8/48.8 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting dill (from python-terrier)\n", + " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pytrec_eval_terrier>=0.5.3 (from python-terrier)\n", + " Downloading pytrec_eval_terrier-0.5.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.4/287.4 kB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typish>=1.7.0 (from nptyping==1.4.4->python-terrier)\n", + " Downloading typish-1.9.3-py3-none-any.whl (45 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir_datasets) (2.5)\n", + "Collecting cwl-eval>=1.0.10 (from ir_measures>=0.3.1->python-terrier)\n", + " Downloading cwl-eval-1.0.12.tar.gz (31 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting cbor>=1.0.0 (from trec-car-tools>=2.5.4->ir_datasets)\n", + " Downloading cbor-1.0.0.tar.gz (20 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting heapdict (from chest->python-terrier)\n", + " Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.14.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.5)\n", + "Collecting multiset<3.0,>=2.0 (from matchpy->python-terrier)\n", + " Downloading multiset-2.1.1-py2.py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.3.0)\n", + "Requirement already satisfied: patsy>=0.5.4 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.6)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.4->statsmodels->python-terrier) (1.16.0)\n", + "Building wheels for collected packages: python-terrier, ir_measures, warc3-wet-clueweb09, zlib-state, chest, wget, cbor, cwl-eval\n", + " Building wheel for python-terrier (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for python-terrier: filename=python_terrier-0.10.0-py3-none-any.whl size=115532 sha256=394e151155af25c148855420bbb95bf82d6a7401fdd510492d95acdf8ad2600a\n", + " Stored in directory: /root/.cache/pip/wheels/79/7c/8f/679a982895c53af35178eceda648a4bc9a9af6af5542e31a0e\n", + " Building wheel for ir_measures (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for ir_measures: filename=ir_measures-0.3.3-py3-none-any.whl size=61182 sha256=efceeb3dedfbdbcd815fc5022cc080d311ef7bc21560e996e9c1ec789f7c243e\n", + " Stored in directory: /root/.cache/pip/wheels/9f/0e/22/718279f23fef1673a4c5e433881c25080a6afaa147e007183e\n", + " Building wheel for warc3-wet-clueweb09 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for warc3-wet-clueweb09: filename=warc3_wet_clueweb09-0.2.5-py3-none-any.whl size=18919 sha256=97ad3a19708d56e219717652b55b07748d9046edd51cae8e08eae35c020e9721\n", + " Stored in directory: /root/.cache/pip/wheels/1a/d7/91/7ffb991df87e62355d945745035470ba2616aa3d83a250b5f9\n", + " Building wheel for zlib-state (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for zlib-state: filename=zlib_state-0.1.6-cp310-cp310-linux_x86_64.whl size=21158 sha256=5f43716c70002e31647a91a1cd4629adb01f6c73e53495b9e4470f32578a2023\n", + " Stored in directory: /root/.cache/pip/wheels/32/72/7e/aff80f26e926b6e1fb08dfb52aba03c0e058f5e2258deb50a9\n", + " Building wheel for chest (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for chest: filename=chest-0.2.3-py3-none-any.whl size=7612 sha256=30baf75cca816ccf270026c749b5294a46516f4aec851937b599a59f62bf5bc5\n", + " Stored in directory: /root/.cache/pip/wheels/88/cf/99/4773b31f855f9ecedc32a0ae400f7a4a3001b37c439b6d1a73\n", + " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=a04575a6039fcba746c760548101d89b83a2ee298f34cf823fa819a9d378095c\n", + " Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769\n", + " Building wheel for cbor (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for cbor: filename=cbor-1.0.0-cp310-cp310-linux_x86_64.whl size=53433 sha256=35d4f5582fbb82c24564193697f891573a06efafbe2ab461776c866f37532f1a\n", + " Stored in directory: /root/.cache/pip/wheels/85/df/c9/b39e40eccaf76dbd218556639a6dc81562226f4c6a64902c85\n", + " Building wheel for cwl-eval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for cwl-eval: filename=cwl_eval-1.0.12-py3-none-any.whl size=38068 sha256=42074228fad8cf2b44e7256ee22f6198d695db62cf31accce8b16d5fc4403a78\n", + " Stored in directory: /root/.cache/pip/wheels/3d/c1/94/94a3e5379b1aa8fb7c7f1ad1956305d5edc98ef745b6067d87\n", + "Successfully built python-terrier ir_measures warc3-wet-clueweb09 zlib-state chest wget cbor cwl-eval\n", + "Installing collected packages: wget, warc3-wet-clueweb09, warc3-wet, typish, pyjnius, multiset, ijson, heapdict, cbor, zlib-state, unlzw3, trec-car-tools, pytrec_eval_terrier, pyautocorpus, nptyping, matchpy, lz4, dill, deprecated, cwl-eval, chest, ir_measures, inscriptis, docker, tira, ir_datasets, python-terrier\n", + "Successfully installed cbor-1.0.0 chest-0.2.3 cwl-eval-1.0.12 deprecated-1.2.14 dill-0.3.8 docker-6.1.3 heapdict-1.0.1 ijson-3.2.3 inscriptis-2.4.0.1 ir_datasets-0.5.6 ir_measures-0.3.3 lz4-4.3.3 matchpy-0.5.5 multiset-2.1.1 nptyping-1.4.4 pyautocorpus-0.1.12 pyjnius-1.6.1 python-terrier-0.10.0 pytrec_eval_terrier-0.5.6 tira-0.0.105 trec-car-tools-2.6 typish-1.9.3 unlzw3-0.2.2 warc3-wet-0.2.3 warc3-wet-clueweb09-0.2.5 wget-3.2 zlib-state-0.1.6\n" + ] + } + ], + "source": [ + "!pip3 install tira ir_datasets python-terrier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eiUPE4X_C_kT", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f2024f17-7c58-420e-e6c0-afe60a2af4ac" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "terrier-assemblies 5.7 jar-with-dependencies not found, downloading to 
/root/.pyterrier...\n", + "Done\n", + "terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...\n", + "Done\n", + "terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...\n", + "Done\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTerrier 0.10.0 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7\n", + "\n" + ] + } + ], + "source": [ + "import pyterrier as pt\n", + "from tira.third_party_integrations import ensure_pyterrier_is_loaded\n", + "from tira.rest_api_client import Client\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "ensure_pyterrier_is_loaded()\n", + "tira = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kxPNnd9sDXPR", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "756c3027-c493-4cfa-e5c0-9f2dd5973a1e" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " qid query\n", + "0 3990512 how can we get concentration onsomething\n", + "1 714612 why doesn t the water fall off earth if it s r...\n", + "2 2528767 how do i determine the charge of the iron ion ...\n", + "3 821387 i have mice how do i get rid of them humanely\n", + "4 1880028 what does see leaflet mean on ept pregnancy test" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidquery
03990512how can we get concentration onsomething
1714612why doesn t the water fall off earth if it s r...
22528767how do i determine the charge of the iron ion ...
3821387i have mice how do i get rid of them humanely
41880028what does see leaflet mean on ept pregnancy test
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"pt_dataset\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"714612\",\n \"1880028\",\n \"2528767\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"why doesn t the water fall off earth if it s round\",\n \"what does see leaflet mean on ept pregnancy test\",\n \"how do i determine the charge of the iron ion in fecl3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "dataset = 'antique-test-20230107-training'\n", + "pt_dataset = pt.get_dataset(f\"irds:ir-benchmarks/{dataset}\")\n", + "\n", + "pt_dataset.get_topics('query').head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "L2jagIm2EFYs", + "outputId": "4801d40b-683c-41c3-9b44-0722d604ce92" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " qid query \\\n", + "0 3990512 how can we get concentration onsomething \n", + "1 714612 why doesn t the water fall off earth if it s r... \n", + "2 2528767 how do i determine the charge of the iron ion ... \n", + "3 821387 i have mice how do i get rid of them humanely \n", + "4 1880028 what does see leaflet mean on ept pregnancy test \n", + "\n", + " intent_prediction \n", + "0 Instrumental \n", + "1 Factual \n", + "2 Abstain \n", + "3 Navigational \n", + "4 Factual " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidqueryintent_prediction
03990512how can we get concentration onsomethingInstrumental
1714612why doesn t the water fall off earth if it s r...Factual
22528767how do i determine the charge of the iron ion ...Abstain
3821387i have mice how do i get rid of them humanelyNavigational
41880028what does see leaflet mean on ept pregnancy testFactual
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"intent_prediction(pt_dataset\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"714612\",\n \"1880028\",\n \"2528767\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"why doesn t the water fall off earth if it s round\",\n \"what does see leaflet mean on ept pregnancy test\",\n \"how do i determine the charge of the iron ion in fecl3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Factual\",\n \"Navigational\",\n \"Instrumental\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)\n", + "intent_prediction(pt_dataset.get_topics('query').head())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1goPEbxtJ2ZP" + }, + "source": [ + "# Analyse the pre-retrieval intent predictors (that don't use the URL) on all datasets\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "I7lrIHE3J0-S", + "outputId": "0ea4c87f-d94e-486d-942b-9be1ff79e2cd" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "37\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 37/37 [00:00<00:00, 41.93it/s]\n" + ] + } + ], + "source": [ + "ir_datasets = [\n", + " 'antique-test-20230107-training', 
'argsme-touche-2021-task-1-20230209-training', 'argsme-touche-2020-task-1-20230209-training',\n", + " 'clueweb09-en-trec-web-2009-20230107-training', 'clueweb09-en-trec-web-2010-20230107-training', 'clueweb09-en-trec-web-2011-20230107-training',\n", + " 'clueweb09-en-trec-web-2012-20230107-training', 'clueweb12-touche-2020-task-2-20230209-training', 'clueweb12-touche-2021-task-2-20230209-training',\n", + " 'clueweb12-trec-misinfo-2019-20240214-training', 'clueweb12-trec-web-2013-20230107-training', 'clueweb12-trec-web-2014-20230107-training',\n", + " 'cord19-fulltext-trec-covid-20230107-training', 'cranfield-20230107-training', 'disks45-nocr-trec-robust-2004-20230209-training',\n", + " 'disks45-nocr-trec7-20230209-training', 'disks45-nocr-trec8-20230209-training', 'gov-trec-web-2002-20230209-training',\n", + " 'gov-trec-web-2003-20230209-training', 'gov-trec-web-2004-20230209-training', 'gov2-trec-tb-2004-20230209-training',\n", + " 'gov2-trec-tb-2005-20230209-training', 'gov2-trec-tb-2006-20230209-training', 'longeval-heldout-20230513-training',\n", + " 'longeval-long-september-20230513-training', 'longeval-short-july-20230513-training', 'longeval-train-20230513-training',\n", + " 'medline-2004-trec-genomics-2004-20230107-training', 'medline-2004-trec-genomics-2005-20230107-training', 'medline-2017-trec-pm-2017-20230211-training',\n", + " 'medline-2017-trec-pm-2018-20230211-training', 'msmarco-passage-trec-dl-2019-judged-20230107-training', 'msmarco-passage-trec-dl-2020-judged-20230107-training',\n", + " 'nfcorpus-test-20230107-training', 'trec-tip-of-the-tongue-dev-20230607-training', 'vaswani-20230107-training',\n", + " 'wapo-v2-trec-core-2018-20230107-training'\n", + "]\n", + "print(len(ir_datasets))\n", + "df_all = []\n", + "\n", + "for dataset in tqdm(ir_datasets):\n", + " pt_dataset = pt.get_dataset(f\"irds:ir-benchmarks/{dataset}\")\n", + " intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)\n", + " df 
= intent_prediction(pt_dataset.get_topics('text'))\n", + " df['dataset'] = dataset\n", + " df_all += [df]\n", + "\n", + "df_all = pd.concat(df_all)\n", + "df_all = df_all[['dataset', 'qid', 'query', 'intent_prediction']]" + ] + }, + { + "cell_type": "code", + "source": [ + "df_fact = df_all.loc[df_all[\"intent_prediction\"] == \"Factual\"]\n", + "#df_navig['intent_prediction'].value_counts(normalize=True) * 100\n", + "df_fact\n", + "df_fact_d = df_fact.loc[df_fact[\"dataset\"] == 'clueweb12-trec-web-2013-20230107-training']\n", + "df_fact_d" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 332 + }, + "id": "Q4VwGJjf6vKh", + "outputId": "4989bbf3-d8d0-487b-e5df-568158d496a4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " dataset qid \\\n", + "4 clueweb12-trec-web-2013-20230107-training 205 \n", + "7 clueweb12-trec-web-2013-20230107-training 208 \n", + "10 clueweb12-trec-web-2013-20230107-training 211 \n", + "18 clueweb12-trec-web-2013-20230107-training 219 \n", + "20 clueweb12-trec-web-2013-20230107-training 221 \n", + "28 clueweb12-trec-web-2013-20230107-training 229 \n", + "30 clueweb12-trec-web-2013-20230107-training 231 \n", + "40 clueweb12-trec-web-2013-20230107-training 241 \n", + "41 clueweb12-trec-web-2013-20230107-training 242 \n", + "\n", + " query intent_prediction \n", + "4 average charitable donation Factual \n", + "7 doctor zhivago Factual \n", + "10 what is madagascar known for Factual \n", + "18 what was the name of elvis presley s home Factual \n", + "20 electoral college 2008 results Factual \n", + "28 beef stroganoff recipe Factual \n", + "30 what are the seven deadly sins Factual \n", + "40 what is a wiki Factual \n", + "41 cannellini beans Factual " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetqidqueryintent_prediction
4clueweb12-trec-web-2013-20230107-training205average charitable donationFactual
7clueweb12-trec-web-2013-20230107-training208doctor zhivagoFactual
10clueweb12-trec-web-2013-20230107-training211what is madagascar known forFactual
18clueweb12-trec-web-2013-20230107-training219what was the name of elvis presley s homeFactual
20clueweb12-trec-web-2013-20230107-training221electoral college 2008 resultsFactual
28clueweb12-trec-web-2013-20230107-training229beef stroganoff recipeFactual
30clueweb12-trec-web-2013-20230107-training231what are the seven deadly sinsFactual
40clueweb12-trec-web-2013-20230107-training241what is a wikiFactual
41clueweb12-trec-web-2013-20230107-training242cannellini beansFactual
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_fact_d", + "summary": "{\n \"name\": \"df_fact_d\",\n \"rows\": 9,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"clueweb12-trec-web-2013-20230107-training\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"241\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"what is a wiki\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Factual\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 96 + } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "6Obbf80ELrmo", + "outputId": "19bd42de-7e9d-490d-8d00-6e49c7dffb42" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " dataset qid \\\n", + "0 antique-test-20230107-training 3990512 \n", + "1 antique-test-20230107-training 714612 \n", + "2 antique-test-20230107-training 2528767 \n", + "3 antique-test-20230107-training 821387 \n", + "4 antique-test-20230107-training 1880028 \n", + ".. ... ... 
\n", + "45 wapo-v2-trec-core-2018-20230107-training 821 \n", + "46 wapo-v2-trec-core-2018-20230107-training 822 \n", + "47 wapo-v2-trec-core-2018-20230107-training 823 \n", + "48 wapo-v2-trec-core-2018-20230107-training 824 \n", + "49 wapo-v2-trec-core-2018-20230107-training 825 \n", + "\n", + " query intent_prediction \n", + "0 how can we get concentration onsomething Instrumental \n", + "1 why doesn t the water fall off earth if it s r... Factual \n", + "2 how do i determine the charge of the iron ion ... Abstain \n", + "3 i have mice how do i get rid of them humanely Navigational \n", + "4 what does see leaflet mean on ept pregnancy test Factual \n", + ".. ... ... \n", + "45 email scams Abstain \n", + "46 sony cyberattack Abstain \n", + "47 control of mrsa Instrumental \n", + "48 bezos purchases washington post Abstain \n", + "49 ethanol and food prices Abstain \n", + "\n", + "[5320 rows x 4 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetqidqueryintent_prediction
0antique-test-20230107-training3990512how can we get concentration onsomethingInstrumental
1antique-test-20230107-training714612why doesn t the water fall off earth if it s r...Factual
2antique-test-20230107-training2528767how do i determine the charge of the iron ion ...Abstain
3antique-test-20230107-training821387i have mice how do i get rid of them humanelyNavigational
4antique-test-20230107-training1880028what does see leaflet mean on ept pregnancy testFactual
...............
45wapo-v2-trec-core-2018-20230107-training821email scamsAbstain
46wapo-v2-trec-core-2018-20230107-training822sony cyberattackAbstain
47wapo-v2-trec-core-2018-20230107-training823control of mrsaInstrumental
48wapo-v2-trec-core-2018-20230107-training824bezos purchases washington postAbstain
49wapo-v2-trec-core-2018-20230107-training825ethanol and food pricesAbstain
\n", + "

5320 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_all", + "summary": "{\n \"name\": \"df_all\",\n \"rows\": 5320,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"gov-trec-web-2002-20230209-training\",\n \"cranfield-20230107-training\",\n \"clueweb09-en-trec-web-2010-20230107-training\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4003,\n \"samples\": [\n \"633\",\n \"q072226155\",\n \"q07222603\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4484,\n \"samples\": [\n \"how can i become a playa\",\n \"nuts\",\n \"what are the general effects on flow fields when the reynolds number is small\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Factual\",\n \"Transactional\",\n \"Abstain\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 77 + } + ], + "source": [ + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OYf57UfRL8Ob", + "outputId": "af27095d-5035-4b63-aabb-b88026ebfe08" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Abstain 72.274436\n", + "Factual 17.124060\n", + "Instrumental 5.018797\n", + "Transactional 4.530075\n", + "Navigational 1.052632\n", + "Name: intent_prediction, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 79 + } + ], + "source": [ + 
"df_all['intent_prediction'].value_counts(normalize=True) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9bQESjpKMhBL" + }, + "outputs": [], + "source": [ + "# additional ideas: query intent distribution per dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SMEdIw52Jsv7" + }, + "outputs": [], + "source": [ + "# Idea for evaluation: Evaluate if different retrieval models are good for different types of queries. Are system rankings reproducible between different qeuer intents?\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 442 + }, + "id": "YjMyn2xsMojQ", + "outputId": "615f7417-562e-4548-eb6c-6367ca77b5cb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " name qid measure value\n", + "70 BM25 100653 ndcg_cut.10 0.428160\n", + "85 BM25 1015624 ndcg_cut.10 0.580570\n", + "43 BM25 1017690 ndcg_cut.10 0.754357\n", + "116 BM25 1035857 ndcg_cut.10 0.652168\n", + "122 BM25 103830 ndcg_cut.10 0.530858\n", + ".. ... ... ... ...\n", + "219 colBERT 851124 ndcg_cut.10 0.743433\n", + "295 colBERT 896725 ndcg_cut.10 0.606536\n", + "304 colBERT 922849 ndcg_cut.10 0.506877\n", + "378 colBERT 949154 ndcg_cut.10 0.816732\n", + "342 colBERT 953489 ndcg_cut.10 0.097348\n", + "\n", + "[600 rows x 4 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameqidmeasurevalue
70BM25100653ndcg_cut.100.428160
85BM251015624ndcg_cut.100.580570
43BM251017690ndcg_cut.100.754357
116BM251035857ndcg_cut.100.652168
122BM25103830ndcg_cut.100.530858
...............
219colBERT851124ndcg_cut.100.743433
295colBERT896725ndcg_cut.100.606536
304colBERT922849ndcg_cut.100.506877
378colBERT949154ndcg_cut.100.816732
342colBERT953489ndcg_cut.100.097348
\n", + "

600 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_eval", + "summary": "{\n \"name\": \"df_eval\",\n \"rows\": 600,\n \"fields\": [\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"BM25\",\n \"MonoT5\",\n \"colBERT\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 200,\n \"samples\": [\n \"2864267\",\n \"1287437\",\n \"1623623\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"measure\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ndcg_cut.10\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.21537360947862202,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 572,\n \"samples\": [\n 0.45542313524021955\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 38 + } + ], + "source": [ + "dataset = 'antique-test-20230107-training'\n", + "pt_dataset = pt.get_dataset(f\"irds:ir-benchmarks/{dataset}\")\n", + "\n", + "pt_dataset.get_topics('query').head()\n", + "\n", + "bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)\n", + "colbert = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/ColBERT Re-Rank (tira-ir-starter-pyterrier)', dataset)\n", + "monot5 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)', dataset)\n", + "\n", + "df_eval = pt.Experiment([bm25, colbert, monot5], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=['ndcg_cut.10'], names=['BM25', 'colBERT', 'MonoT5'], perquery=True)\n", + "df_eval" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "MZHR3OeDYJEG", + "outputId": "526f6537-ea81-4b9d-b854-d1bb63ea736b" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " name qid measure value intent_prediction\n", + "70 BM25 100653 ndcg_cut.10 0.428160 Instrumental\n", + "85 BM25 1015624 ndcg_cut.10 0.580570 Factual\n", + "43 BM25 1017690 ndcg_cut.10 0.754357 Factual\n", + "116 BM25 1035857 ndcg_cut.10 0.652168 Factual\n", + "122 BM25 103830 ndcg_cut.10 0.530858 Factual\n", + ".. ... ... ... ... ...\n", + "219 colBERT 851124 ndcg_cut.10 0.743433 Factual\n", + "295 colBERT 896725 ndcg_cut.10 0.606536 Factual\n", + "304 colBERT 922849 ndcg_cut.10 0.506877 Factual\n", + "378 colBERT 949154 ndcg_cut.10 0.816732 Factual\n", + "342 colBERT 953489 ndcg_cut.10 0.097348 Factual\n", + "\n", + "[600 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameqidmeasurevalueintent_prediction
70BM25100653ndcg_cut.100.428160Instrumental
85BM251015624ndcg_cut.100.580570Factual
43BM251017690ndcg_cut.100.754357Factual
116BM251035857ndcg_cut.100.652168Factual
122BM25103830ndcg_cut.100.530858Factual
..................
219colBERT851124ndcg_cut.100.743433Factual
295colBERT896725ndcg_cut.100.606536Factual
304colBERT922849ndcg_cut.100.506877Factual
378colBERT949154ndcg_cut.100.816732Factual
342colBERT953489ndcg_cut.100.097348Factual
\n", + "

600 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_eval", + "summary": "{\n \"name\": \"df_eval\",\n \"rows\": 600,\n \"fields\": [\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"BM25\",\n \"MonoT5\",\n \"colBERT\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 200,\n \"samples\": [\n \"2864267\",\n \"1287437\",\n \"1623623\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"measure\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"ndcg_cut.10\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.21537360947862202,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 572,\n \"samples\": [\n 0.45542313524021955\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Factual\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "# now join this with intent predictions\n", + "df_eval\n", + "intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)\n", + "intent_prediction(df_eval)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-xxK4GIrYahQ" + }, + "source": [ + "# Post retrieval Query Intent prediction (when the URL of the top-result is used for prediction)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IinkkzN1Ygt-", + 
"outputId": "4f666439-bd46-47e1-a82c-a4f4bf103af0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 11/11 [00:00<00:00, 72.93it/s]\n" + ] + } + ], + "source": [ + "# only the web retrieval datasets have an url for documents\n", + "ir_datasets = [\n", + " 'clueweb09-en-trec-web-2009-20230107-training', 'clueweb09-en-trec-web-2010-20230107-training', 'clueweb09-en-trec-web-2011-20230107-training',\n", + " 'clueweb09-en-trec-web-2012-20230107-training', 'clueweb12-trec-web-2013-20230107-training', 'clueweb12-trec-web-2014-20230107-training',\n", + " 'gov-trec-web-2002-20230209-training', 'gov-trec-web-2003-20230209-training', 'gov2-trec-tb-2004-20230209-training',\n", + " 'gov2-trec-tb-2005-20230209-training', 'gov2-trec-tb-2006-20230209-training',\n", + "]\n", + "df_all_url = []\n", + "\n", + "for dataset in tqdm(ir_datasets):\n", + " pt_dataset = pt.get_dataset(f\"irds:ir-benchmarks/{dataset}\")\n", + " intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/post-retrieval-query-intent', dataset)\n", + " df = intent_prediction(pt_dataset.get_topics('text'))\n", + " df['dataset'] = dataset\n", + " df_all_url += [df]\n", + "\n", + "df_all_url = pd.concat(df_all_url)\n", + "df_all_url = df_all_url[['dataset', 'qid', 'query', 'intent_prediction']]" + ] + }, + { + "cell_type": "code", + "source": [ + "df_clueweb11 = df_all_url.loc[df_all[\"dataset\"] == \"clueweb12-trec-web-2014-20230107-training\"]\n", + "df_clueweb11[\"intent_prediction\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kDbNOBA4CrgC", + "outputId": "4bd8b340-445b-464b-e0fe-daa05388915c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Abstain 40\n", + "Instrumental 5\n", + "Factual 3\n", + "Navigational 2\n", + "Name: intent_prediction, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 130 + } + ] + 
}, + { + "cell_type": "code", + "source": [ + "#compare to pre_retrieval\n", + "ir_datasets = [\n", + " 'clueweb09-en-trec-web-2009-20230107-training', 'clueweb09-en-trec-web-2010-20230107-training', 'clueweb09-en-trec-web-2011-20230107-training',\n", + " 'clueweb09-en-trec-web-2012-20230107-training', 'clueweb12-trec-web-2013-20230107-training', 'clueweb12-trec-web-2014-20230107-training',\n", + " 'gov-trec-web-2002-20230209-training', 'gov-trec-web-2003-20230209-training', 'gov2-trec-tb-2004-20230209-training',\n", + " 'gov2-trec-tb-2005-20230209-training', 'gov2-trec-tb-2006-20230209-training',\n", + "]\n", + "df_all = []\n", + "\n", + "for dataset in tqdm(ir_datasets):\n", + " pt_dataset = pt.get_dataset(f\"irds:ir-benchmarks/{dataset}\")\n", + " intent_prediction = tira.pt.transform_queries('ir-benchmarks/dossier/pre-retrieval-query-intent', dataset)\n", + " df = intent_prediction(pt_dataset.get_topics('text'))\n", + " df['dataset'] = dataset\n", + " df_all += [df]\n", + "\n", + "df_all= pd.concat(df_all)\n", + "df_all = df_all[['dataset', 'qid', 'query', 'intent_prediction']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pgUnOBBPCbNq", + "outputId": "9ca0a3f3-1f1c-45f8-a17f-0d02d186d285" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 11/11 [00:00<00:00, 75.67it/s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "82r2yOXVZMwC", + "outputId": "f5d4ac15-5e1a-4d2e-d4a3-be2695830304" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Abstain 88.000000\n", + "Instrumental 5.454545\n", + "Factual 5.454545\n", + "Transactional 0.727273\n", + "Navigational 0.363636\n", + "Name: intent_prediction, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 74 + } + ], + "source": [ + 
"df_all[\"intent_prediction\"].value_counts(normalize=True) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "OcwdFmc1ZO-c", + "outputId": "5f406dac-78cf-4352-b685-5f758f188c7c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " dataset qid \\\n", + "0 clueweb12-trec-web-2014-20230107-training 251 \n", + "1 clueweb12-trec-web-2014-20230107-training 252 \n", + "2 clueweb12-trec-web-2014-20230107-training 253 \n", + "3 clueweb12-trec-web-2014-20230107-training 254 \n", + "4 clueweb12-trec-web-2014-20230107-training 255 \n", + "5 clueweb12-trec-web-2014-20230107-training 256 \n", + "6 clueweb12-trec-web-2014-20230107-training 257 \n", + "7 clueweb12-trec-web-2014-20230107-training 258 \n", + "8 clueweb12-trec-web-2014-20230107-training 259 \n", + "9 clueweb12-trec-web-2014-20230107-training 260 \n", + "10 clueweb12-trec-web-2014-20230107-training 261 \n", + "11 clueweb12-trec-web-2014-20230107-training 262 \n", + "12 clueweb12-trec-web-2014-20230107-training 263 \n", + "13 clueweb12-trec-web-2014-20230107-training 264 \n", + "14 clueweb12-trec-web-2014-20230107-training 265 \n", + "15 clueweb12-trec-web-2014-20230107-training 266 \n", + "16 clueweb12-trec-web-2014-20230107-training 267 \n", + "17 clueweb12-trec-web-2014-20230107-training 268 \n", + "18 clueweb12-trec-web-2014-20230107-training 269 \n", + "19 clueweb12-trec-web-2014-20230107-training 270 \n", + "20 clueweb12-trec-web-2014-20230107-training 271 \n", + "21 clueweb12-trec-web-2014-20230107-training 272 \n", + "22 clueweb12-trec-web-2014-20230107-training 273 \n", + "23 clueweb12-trec-web-2014-20230107-training 274 \n", + "24 clueweb12-trec-web-2014-20230107-training 275 \n", + "25 clueweb12-trec-web-2014-20230107-training 276 \n", + "26 clueweb12-trec-web-2014-20230107-training 277 \n", + "27 clueweb12-trec-web-2014-20230107-training 278 \n", + 
"28 clueweb12-trec-web-2014-20230107-training 279 \n", + "29 clueweb12-trec-web-2014-20230107-training 280 \n", + "30 clueweb12-trec-web-2014-20230107-training 281 \n", + "31 clueweb12-trec-web-2014-20230107-training 282 \n", + "32 clueweb12-trec-web-2014-20230107-training 283 \n", + "33 clueweb12-trec-web-2014-20230107-training 284 \n", + "34 clueweb12-trec-web-2014-20230107-training 285 \n", + "35 clueweb12-trec-web-2014-20230107-training 286 \n", + "36 clueweb12-trec-web-2014-20230107-training 287 \n", + "37 clueweb12-trec-web-2014-20230107-training 288 \n", + "38 clueweb12-trec-web-2014-20230107-training 289 \n", + "39 clueweb12-trec-web-2014-20230107-training 290 \n", + "40 clueweb12-trec-web-2014-20230107-training 291 \n", + "41 clueweb12-trec-web-2014-20230107-training 292 \n", + "42 clueweb12-trec-web-2014-20230107-training 293 \n", + "43 clueweb12-trec-web-2014-20230107-training 294 \n", + "44 clueweb12-trec-web-2014-20230107-training 295 \n", + "45 clueweb12-trec-web-2014-20230107-training 296 \n", + "46 clueweb12-trec-web-2014-20230107-training 297 \n", + "47 clueweb12-trec-web-2014-20230107-training 298 \n", + "48 clueweb12-trec-web-2014-20230107-training 299 \n", + "49 clueweb12-trec-web-2014-20230107-training 300 \n", + "\n", + " query intent_prediction \n", + "0 identifying spider bites Instrumental \n", + "1 history of orcas island Abstain \n", + "2 tooth abscess Abstain \n", + "3 barrett s esophagus Abstain \n", + "4 teddy bears Abstain \n", + "5 patron saint of mental illness Abstain \n", + "6 holes by louis sachar Abstain \n", + "7 hip roof Abstain \n", + "8 carpenter bee Abstain \n", + "9 the american revolutionary Abstain \n", + "10 folk remedies sore throat Abstain \n", + "11 balding cure Abstain \n", + "12 evidence for evolution Abstain \n", + "13 tribe formerly living in alabama Abstain \n", + "14 f5 tornado Factual \n", + "15 symptoms of heart attack Abstain \n", + "16 feliz navidad lyrics Abstain \n", + "17 benefits of running Abstain \n", 
+ "18 marshall county schools Abstain \n", + "19 sun tzu Abstain \n", + "20 halloween activities for middle school Abstain \n", + "21 dreams interpretation Abstain \n", + "22 wilson s disease Abstain \n", + "23 golf instruction Abstain \n", + "24 uss cole Abstain \n", + "25 how has african american music influence history Factual \n", + "26 bewitched cast Abstain \n", + "27 mister rogers Abstain \n", + "28 game theory Abstain \n", + "29 view my internet history Abstain \n", + "30 ketogenic diet Abstain \n", + "31 nasa interplanetary missions Abstain \n", + "32 hayrides in pa Abstain \n", + "33 where to find morel mushrooms Factual \n", + "34 magnesium rich foods Abstain \n", + "35 common schizophrenia drugs Abstain \n", + "36 carotid cavernous fistula treatment Abstain \n", + "37 fidel castro Abstain \n", + "38 benefits of yoga Abstain \n", + "39 norway spruce Abstain \n", + "40 sangre de cristo mountains Abstain \n", + "41 history of the electronic medical record Abstain \n", + "42 educational advantages of social networking sites Abstain \n", + "43 flowering plants Instrumental \n", + "44 how to tie a windsor knot Instrumental \n", + "45 recycling lead acid batteries Instrumental \n", + "46 altitude sickness Abstain \n", + "47 medical care and jehovah s witnesses Abstain \n", + "48 pink slime in ground beef Abstain \n", + "49 how to find the mean Instrumental " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetqidqueryintent_prediction
0clueweb12-trec-web-2014-20230107-training251identifying spider bitesInstrumental
1clueweb12-trec-web-2014-20230107-training252history of orcas islandAbstain
2clueweb12-trec-web-2014-20230107-training253tooth abscessAbstain
3clueweb12-trec-web-2014-20230107-training254barrett s esophagusAbstain
4clueweb12-trec-web-2014-20230107-training255teddy bearsAbstain
5clueweb12-trec-web-2014-20230107-training256patron saint of mental illnessAbstain
6clueweb12-trec-web-2014-20230107-training257holes by louis sacharAbstain
7clueweb12-trec-web-2014-20230107-training258hip roofAbstain
8clueweb12-trec-web-2014-20230107-training259carpenter beeAbstain
9clueweb12-trec-web-2014-20230107-training260the american revolutionaryAbstain
10clueweb12-trec-web-2014-20230107-training261folk remedies sore throatAbstain
11clueweb12-trec-web-2014-20230107-training262balding cureAbstain
12clueweb12-trec-web-2014-20230107-training263evidence for evolutionAbstain
13clueweb12-trec-web-2014-20230107-training264tribe formerly living in alabamaAbstain
14clueweb12-trec-web-2014-20230107-training265f5 tornadoFactual
15clueweb12-trec-web-2014-20230107-training266symptoms of heart attackAbstain
16clueweb12-trec-web-2014-20230107-training267feliz navidad lyricsAbstain
17clueweb12-trec-web-2014-20230107-training268benefits of runningAbstain
18clueweb12-trec-web-2014-20230107-training269marshall county schoolsAbstain
19clueweb12-trec-web-2014-20230107-training270sun tzuAbstain
20clueweb12-trec-web-2014-20230107-training271halloween activities for middle schoolAbstain
21clueweb12-trec-web-2014-20230107-training272dreams interpretationAbstain
22clueweb12-trec-web-2014-20230107-training273wilson s diseaseAbstain
23clueweb12-trec-web-2014-20230107-training274golf instructionAbstain
24clueweb12-trec-web-2014-20230107-training275uss coleAbstain
25clueweb12-trec-web-2014-20230107-training276how has african american music influence historyFactual
26clueweb12-trec-web-2014-20230107-training277bewitched castAbstain
27clueweb12-trec-web-2014-20230107-training278mister rogersAbstain
28clueweb12-trec-web-2014-20230107-training279game theoryAbstain
29clueweb12-trec-web-2014-20230107-training280view my internet historyAbstain
30clueweb12-trec-web-2014-20230107-training281ketogenic dietAbstain
31clueweb12-trec-web-2014-20230107-training282nasa interplanetary missionsAbstain
32clueweb12-trec-web-2014-20230107-training283hayrides in paAbstain
33clueweb12-trec-web-2014-20230107-training284where to find morel mushroomsFactual
34clueweb12-trec-web-2014-20230107-training285magnesium rich foodsAbstain
35clueweb12-trec-web-2014-20230107-training286common schizophrenia drugsAbstain
36clueweb12-trec-web-2014-20230107-training287carotid cavernous fistula treatmentAbstain
37clueweb12-trec-web-2014-20230107-training288fidel castroAbstain
38clueweb12-trec-web-2014-20230107-training289benefits of yogaAbstain
39clueweb12-trec-web-2014-20230107-training290norway spruceAbstain
40clueweb12-trec-web-2014-20230107-training291sangre de cristo mountainsAbstain
41clueweb12-trec-web-2014-20230107-training292history of the electronic medical recordAbstain
42clueweb12-trec-web-2014-20230107-training293educational advantages of social networking sitesAbstain
43clueweb12-trec-web-2014-20230107-training294flowering plantsInstrumental
44clueweb12-trec-web-2014-20230107-training295how to tie a windsor knotInstrumental
45clueweb12-trec-web-2014-20230107-training296recycling lead acid batteriesInstrumental
46clueweb12-trec-web-2014-20230107-training297altitude sicknessAbstain
47clueweb12-trec-web-2014-20230107-training298medical care and jehovah s witnessesAbstain
48clueweb12-trec-web-2014-20230107-training299pink slime in ground beefAbstain
49clueweb12-trec-web-2014-20230107-training300how to find the meanInstrumental
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_clueweb11", + "summary": "{\n \"name\": \"df_clueweb11\",\n \"rows\": 50,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"clueweb12-trec-web-2014-20230107-training\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"264\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"tribe formerly living in alabama\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Instrumental\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 133 + } + ], + "source": [ + "# including the url increased the proportion of navigational queries :)\n", + "\n", + "df_clueweb11 = df_all.loc[df_all[\"dataset\"] == \"clueweb12-trec-web-2014-20230107-training\"]\n", + "df_clueweb11\n", + "#df_clueweb11[\"intent_prediction\"].value_counts(normalize=True) * 100" + ] + }, + { + "cell_type": "code", + "source": [ + "df_trans = df_all.loc[df_all[\"intent_prediction\"] == \"Transactional\"]\n", + "df_trans" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "bE8IU5Hq-cJ_", + "outputId": "ec7afeaf-5782-4db8-e831-daf498c235e4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " dataset qid \\\n", + "17 clueweb09-en-trec-web-2009-20230107-training 18 \n", + "15 
clueweb09-en-trec-web-2010-20230107-training 66 \n", + "44 clueweb09-en-trec-web-2010-20230107-training 95 \n", + "4 gov-trec-web-2002-20230209-training 555 \n", + "\n", + " query intent_prediction \n", + "17 wedding budget calculator Transactional \n", + "15 income tax return online Transactional \n", + "44 earn money at home Transactional \n", + "4 criteria obtain u s visa Transactional " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetqidqueryintent_prediction
17clueweb09-en-trec-web-2009-20230107-training18wedding budget calculatorTransactional
15clueweb09-en-trec-web-2010-20230107-training66income tax return onlineTransactional
44clueweb09-en-trec-web-2010-20230107-training95earn money at homeTransactional
4gov-trec-web-2002-20230209-training555criteria obtain u s visaTransactional
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_trans", + "summary": "{\n \"name\": \"df_trans\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"dataset\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"clueweb09-en-trec-web-2009-20230107-training\",\n \"clueweb09-en-trec-web-2010-20230107-training\",\n \"gov-trec-web-2002-20230209-training\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"66\",\n \"555\",\n \"18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"income tax return online\",\n \"criteria obtain u s visa\",\n \"wedding budget calculator\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"intent_prediction\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Transactional\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 127 + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file