diff --git a/tutorials/README.md b/tutorials/README.md index 8f2bbf4..687999e 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -39,6 +39,7 @@ More complex topics that might not be suited to every IR course are still covere | Splade for Document Processing (work in progress)| [🔗](tutorial-splade-ranking.ipynb) | ⏳ | | DocT5Query (work in progress)| [🔗](tutorial-doc-t5-query.ipynb) | ⏳ | | Genre Classification (work in progress)| [🔗](tutorial-genre-classification.ipynb) | ⏳ | +| Corpus Graph (work in progress)| [🔗](tutorial-corpus-graph.ipynb) | ⏳ | | Re-ranking with cross-encoders or bi-encoders ([work in progress](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3)) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | | _Anyhting missing? [Propose new tutorial.](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/new)_ | | | diff --git a/tutorials/tutorial-corpus-graph.ipynb b/tutorials/tutorial-corpus-graph.ipynb index ce72581..cdaf7ac 100644 --- a/tutorials/tutorial-corpus-graph.ipynb +++ b/tutorials/tutorial-corpus-graph.ipynb @@ -18,18 +18,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from tira.third_party_integrations import ensure_pyterrier_is_loaded\n", "from tira.rest_api_client import Client\n", "ensure_pyterrier_is_loaded()\n", - "import pandas as pd\n", "import pyterrier as pt\n", - "from tqdm import tqdm\n", - "import gzip\n", - "import json\n", "\n", "tira = Client()" ] @@ -688,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -699,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -829,7 +825,7 @@ "[9656 rows x 4 columns]" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, 
"output_type": "execute_result" } @@ -842,24 +838,757 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." + "name": "stdout", + "output_type": "stream", + "text": [ + "Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/corpus-graph/2024-03-21-12-46-50.zip\n", + "\tThis is only used for last spot checks before archival to Zenodo.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Download: 100%|██████████| 93.2M/93.2M [00:09<00:00, 10.8MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download finished. 
Extract...\n", + "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/seanmacavaney\n" ] } ], "source": [ "corpus_graph = tira.pt.transform_documents('ir-benchmarks/seanmacavaney/corpus-graph', 'longeval-train-20230513-training') " ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "qrels_dict = {}\n", + "\n", + "for _, i in qrels.iterrows():\n", + " if i['qid'] not in qrels_dict:\n", + " qrels_dict[i['qid']] = {}\n", + " qrels_dict[i['qid']][i['docno']] = i['label']\n", + "\n", + "def count_relevant_neighbours(entry, relevance_level=1):\n", + " ret = 0\n", + " for neighbor in entry['neighbors']:\n", + " if qrels_dict[entry['qid']].get(neighbor, 0) >= relevance_level:\n", + " ret += 1\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qiddocnolabeliterationneighbors
3q06223196doc06220020446510[doc062200401429, doc062200106171, doc06221110...
10q06223196doc06220020549310[doc062201708464, doc062200115614, doc06220500...
20q062228doc06220011655510[doc062208807613, doc062208706096, doc06221040...
21q062228doc06220011627320[doc062200100875, doc062201300294, doc06220380...
25q062287doc06220020998110[doc062200116769, doc062215804698, doc06220010...
..................
9636q062225194doc06220020137920[doc062208607784, doc062200116561, doc06220830...
9640q062225194doc06220020501110[doc062208406169, doc062208603880, doc06220860...
9641q062225194doc06220020443310[doc062200112015, doc062200113353, doc06220841...
9647q062225197doc06220020753810[doc062202104464, doc062201710012, doc06220200...
9652q062225197doc06220010712110[doc062201902653, doc062200117385, doc06220880...
\n", + "

2626 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " qid docno label iteration \\\n", + "3 q06223196 doc062200204465 1 0 \n", + "10 q06223196 doc062200205493 1 0 \n", + "20 q062228 doc062200116555 1 0 \n", + "21 q062228 doc062200116273 2 0 \n", + "25 q062287 doc062200209981 1 0 \n", + "... ... ... ... ... \n", + "9636 q062225194 doc062200201379 2 0 \n", + "9640 q062225194 doc062200205011 1 0 \n", + "9641 q062225194 doc062200204433 1 0 \n", + "9647 q062225197 doc062200207538 1 0 \n", + "9652 q062225197 doc062200107121 1 0 \n", + "\n", + " neighbors \n", + "3 [doc062200401429, doc062200106171, doc06221110... \n", + "10 [doc062201708464, doc062200115614, doc06220500... \n", + "20 [doc062208807613, doc062208706096, doc06221040... \n", + "21 [doc062200100875, doc062201300294, doc06220380... \n", + "25 [doc062200116769, doc062215804698, doc06220010... \n", + "... ... \n", + "9636 [doc062208607784, doc062200116561, doc06220830... \n", + "9640 [doc062208406169, doc062208603880, doc06220860... \n", + "9641 [doc062200112015, doc062200113353, doc06220841... \n", + "9647 [doc062202104464, doc062201710012, doc06220200... \n", + "9652 [doc062201902653, doc062200117385, doc06220880... \n", + "\n", + "[2626 rows x 5 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if there is no neighbour, we have no entry in the corpus graph (maybe should allow default values in transform documents\n", + "qrels_with_neighbors = corpus_graph(qrels[qrels['label'] > 0].copy())\n", + "qrels_with_neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qiddocnolabeliterationneighborsrelevant_neighbors
3q06223196doc06220020446510[doc062200401429, doc062200106171, doc06221110...0
10q06223196doc06220020549310[doc062201708464, doc062200115614, doc06220500...0
20q062228doc06220011655510[doc062208807613, doc062208706096, doc06221040...0
21q062228doc06220011627320[doc062200100875, doc062201300294, doc06220380...0
25q062287doc06220020998110[doc062200116769, doc062215804698, doc06220010...0
.....................
9636q062225194doc06220020137920[doc062208607784, doc062200116561, doc06220830...0
9640q062225194doc06220020501110[doc062208406169, doc062208603880, doc06220860...0
9641q062225194doc06220020443310[doc062200112015, doc062200113353, doc06220841...1
9647q062225197doc06220020753810[doc062202104464, doc062201710012, doc06220200...0
9652q062225197doc06220010712110[doc062201902653, doc062200117385, doc06220880...0
\n", + "

2626 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " qid docno label iteration \\\n", + "3 q06223196 doc062200204465 1 0 \n", + "10 q06223196 doc062200205493 1 0 \n", + "20 q062228 doc062200116555 1 0 \n", + "21 q062228 doc062200116273 2 0 \n", + "25 q062287 doc062200209981 1 0 \n", + "... ... ... ... ... \n", + "9636 q062225194 doc062200201379 2 0 \n", + "9640 q062225194 doc062200205011 1 0 \n", + "9641 q062225194 doc062200204433 1 0 \n", + "9647 q062225197 doc062200207538 1 0 \n", + "9652 q062225197 doc062200107121 1 0 \n", + "\n", + " neighbors relevant_neighbors \n", + "3 [doc062200401429, doc062200106171, doc06221110... 0 \n", + "10 [doc062201708464, doc062200115614, doc06220500... 0 \n", + "20 [doc062208807613, doc062208706096, doc06221040... 0 \n", + "21 [doc062200100875, doc062201300294, doc06220380... 0 \n", + "25 [doc062200116769, doc062215804698, doc06220010... 0 \n", + "... ... ... \n", + "9636 [doc062208607784, doc062200116561, doc06220830... 0 \n", + "9640 [doc062208406169, doc062208603880, doc06220860... 0 \n", + "9641 [doc062200112015, doc062200113353, doc06220841... 1 \n", + "9647 [doc062202104464, doc062201710012, doc06220200... 0 \n", + "9652 [doc062201902653, doc062200117385, doc06220880... 
0 \n", + "\n", + "[2626 rows x 6 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qrels_with_neighbors['relevant_neighbors'] = qrels_with_neighbors.apply(count_relevant_neighbours, axis=1)\n", + "qrels_with_neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "656" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "len(qrels_with_neighbors['qid'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4.009160305343512" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# On average 4 relevant documents per query\n", + "2626/655" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 2626.000000\n", + "mean 0.610815\n", + "std 0.941377\n", + "min 0.000000\n", + "25% 0.000000\n", + "50% 0.000000\n", + "75% 1.000000\n", + "max 7.000000\n", + "Name: relevant_neighbors, dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# On average, 0.6 relevant documents are retrieved via the corpus graph for docs in the qrels\n", + "# Reasonable, as there are only 4 relevant documents per query on average, but if you find one, you are likely to find \"0.6 more\" via the corpus graph, which is especially reasonable as you can aggregate this over multiple top results per query.\n", + "qrels_with_neighbors['relevant_neighbors'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Download: 1.11MiB [00:00, 7.31MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download finished. Extract...\n", + "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/tira-ir-starter\n" + ] + } + ], + "source": [ + "#Usually, you apply it to some tranformer-based model...\n", + "monot5 = tira.pt.from_submission(\"ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)\", \"longeval-train-20230513-training\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidqueryq0rankscoresystemdocnotira_tasktira_datasettira_first_stage_run_id
0q06223196car shelter01-0.003520castorini/monot5-base-msmarco-10kdoc062201708464ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
1q06223196car shelter02-0.005353castorini/monot5-base-msmarco-10kdoc062200108613ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
2q06223196car shelter03-0.006328castorini/monot5-base-msmarco-10kdoc062200206319ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
3q06223196car shelter04-0.006333castorini/monot5-base-msmarco-10kdoc062200112743ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
4q06223196car shelter05-0.006599castorini/monot5-base-msmarco-10kdoc062201708471ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
.................................
66562q062225197cheapest car096-10.073474castorini/monot5-base-msmarco-10kdoc062202202627ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
66563q062225197cheapest car097-10.074920castorini/monot5-base-msmarco-10kdoc062202005382ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
66564q062225197cheapest car098-10.288811castorini/monot5-base-msmarco-10kdoc062202002893ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
66565q062225197cheapest car099-10.347551castorini/monot5-base-msmarco-10kdoc062214906085ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
66566q062225197cheapest car0100-10.347551castorini/monot5-base-msmarco-10kdoc062214903805ir-benchmarkslongeval-train-20230513-training2024-03-18-12-56-01
\n", + "

66567 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " qid query q0 rank score \\\n", + "0 q06223196 car shelter 0 1 -0.003520 \n", + "1 q06223196 car shelter 0 2 -0.005353 \n", + "2 q06223196 car shelter 0 3 -0.006328 \n", + "3 q06223196 car shelter 0 4 -0.006333 \n", + "4 q06223196 car shelter 0 5 -0.006599 \n", + "... ... ... .. ... ... \n", + "66562 q062225197 cheapest car 0 96 -10.073474 \n", + "66563 q062225197 cheapest car 0 97 -10.074920 \n", + "66564 q062225197 cheapest car 0 98 -10.288811 \n", + "66565 q062225197 cheapest car 0 99 -10.347551 \n", + "66566 q062225197 cheapest car 0 100 -10.347551 \n", + "\n", + " system docno tira_task \\\n", + "0 castorini/monot5-base-msmarco-10k doc062201708464 ir-benchmarks \n", + "1 castorini/monot5-base-msmarco-10k doc062200108613 ir-benchmarks \n", + "2 castorini/monot5-base-msmarco-10k doc062200206319 ir-benchmarks \n", + "3 castorini/monot5-base-msmarco-10k doc062200112743 ir-benchmarks \n", + "4 castorini/monot5-base-msmarco-10k doc062201708471 ir-benchmarks \n", + "... ... ... ... \n", + "66562 castorini/monot5-base-msmarco-10k doc062202202627 ir-benchmarks \n", + "66563 castorini/monot5-base-msmarco-10k doc062202005382 ir-benchmarks \n", + "66564 castorini/monot5-base-msmarco-10k doc062202002893 ir-benchmarks \n", + "66565 castorini/monot5-base-msmarco-10k doc062214906085 ir-benchmarks \n", + "66566 castorini/monot5-base-msmarco-10k doc062214903805 ir-benchmarks \n", + "\n", + " tira_dataset tira_first_stage_run_id \n", + "0 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "1 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "2 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "3 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "4 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "... ... ... 
\n", + "66562 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "66563 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "66564 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "66565 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "66566 longeval-train-20230513-training 2024-03-18-12-56-01 \n", + "\n", + "[66567 rows x 10 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "monot5(dataset.get_topics('title'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ToDo: now integrate the adaptive re-ranking..." + ] } ], "metadata": {