diff --git a/tutorials/README.md b/tutorials/README.md index 8f2bbf4..687999e 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -39,6 +39,7 @@ More complex topics that might not be suited to every IR course are still covere | Splade for Document Processing (work in progress)| [🔗](tutorial-splade-ranking.ipynb) | ⏳ | | DocT5Query (work in progress)| [🔗](tutorial-doc-t5-query.ipynb) | ⏳ | | Genre Classification (work in progress)| [🔗](tutorial-genre-classification.ipynb) | ⏳ | +| Corpus Graph (work in progress)| [🔗](tutorial-corpus-graph.ipynb) | ⏳ | | Re-ranking with cross-encoders or bi-encoders ([work in progress](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3)) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | | _Anyhting missing? [Propose new tutorial.](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/new)_ | | | diff --git a/tutorials/tutorial-corpus-graph.ipynb b/tutorials/tutorial-corpus-graph.ipynb index ce72581..cdaf7ac 100644 --- a/tutorials/tutorial-corpus-graph.ipynb +++ b/tutorials/tutorial-corpus-graph.ipynb @@ -18,18 +18,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from tira.third_party_integrations import ensure_pyterrier_is_loaded\n", "from tira.rest_api_client import Client\n", "ensure_pyterrier_is_loaded()\n", - "import pandas as pd\n", "import pyterrier as pt\n", - "from tqdm import tqdm\n", - "import gzip\n", - "import json\n", "\n", "tira = Client()" ] @@ -688,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -699,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -829,7 +825,7 @@ "[9656 rows x 4 columns]" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -842,24 +838,757 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." + "name": "stdout", + "output_type": "stream", + "text": [ + "Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/corpus-graph/2024-03-21-12-46-50.zip\n", + "\tThis is only used for last spot checks before archival to Zenodo.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Download: 100%|██████████| 93.2M/93.2M [00:09<00:00, 10.8MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download finished. Extract...\n", + "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/seanmacavaney\n" ] } ], "source": [ "corpus_graph = tira.pt.transform_documents('ir-benchmarks/seanmacavaney/corpus-graph', 'longeval-train-20230513-training') " ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "qrels_dict = {}\n", + "\n", + "for _, i in qrels.iterrows():\n", + " if i['qid'] not in qrels_dict:\n", + " qrels_dict[i['qid']] = {}\n", + " qrels_dict[i['qid']][i['docno']] = i['label']\n", + "\n", + "def count_relevant_neighbours(entry, relevance_level=1):\n", + " ret = 0\n", + " for neighbor in entry['neighbors']:\n", + " if qrels_dict[entry['qid']].get(neighbor, 0) >= relevance_level:\n", + " ret += 1\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | qid | \n", + "docno | \n", + "label | \n", + "iteration | \n", + "neighbors | \n", + "
---|---|---|---|---|---|
3 | \n", + "q06223196 | \n", + "doc062200204465 | \n", + "1 | \n", + "0 | \n", + "[doc062200401429, doc062200106171, doc06221110... | \n", + "
10 | \n", + "q06223196 | \n", + "doc062200205493 | \n", + "1 | \n", + "0 | \n", + "[doc062201708464, doc062200115614, doc06220500... | \n", + "
20 | \n", + "q062228 | \n", + "doc062200116555 | \n", + "1 | \n", + "0 | \n", + "[doc062208807613, doc062208706096, doc06221040... | \n", + "
21 | \n", + "q062228 | \n", + "doc062200116273 | \n", + "2 | \n", + "0 | \n", + "[doc062200100875, doc062201300294, doc06220380... | \n", + "
25 | \n", + "q062287 | \n", + "doc062200209981 | \n", + "1 | \n", + "0 | \n", + "[doc062200116769, doc062215804698, doc06220010... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
9636 | \n", + "q062225194 | \n", + "doc062200201379 | \n", + "2 | \n", + "0 | \n", + "[doc062208607784, doc062200116561, doc06220830... | \n", + "
9640 | \n", + "q062225194 | \n", + "doc062200205011 | \n", + "1 | \n", + "0 | \n", + "[doc062208406169, doc062208603880, doc06220860... | \n", + "
9641 | \n", + "q062225194 | \n", + "doc062200204433 | \n", + "1 | \n", + "0 | \n", + "[doc062200112015, doc062200113353, doc06220841... | \n", + "
9647 | \n", + "q062225197 | \n", + "doc062200207538 | \n", + "1 | \n", + "0 | \n", + "[doc062202104464, doc062201710012, doc06220200... | \n", + "
9652 | \n", + "q062225197 | \n", + "doc062200107121 | \n", + "1 | \n", + "0 | \n", + "[doc062201902653, doc062200117385, doc06220880... | \n", + "
2626 rows × 5 columns
\n", + "\n", + " | qid | \n", + "docno | \n", + "label | \n", + "iteration | \n", + "neighbors | \n", + "relevant_neighbors | \n", + "
---|---|---|---|---|---|---|
3 | \n", + "q06223196 | \n", + "doc062200204465 | \n", + "1 | \n", + "0 | \n", + "[doc062200401429, doc062200106171, doc06221110... | \n", + "0 | \n", + "
10 | \n", + "q06223196 | \n", + "doc062200205493 | \n", + "1 | \n", + "0 | \n", + "[doc062201708464, doc062200115614, doc06220500... | \n", + "0 | \n", + "
20 | \n", + "q062228 | \n", + "doc062200116555 | \n", + "1 | \n", + "0 | \n", + "[doc062208807613, doc062208706096, doc06221040... | \n", + "0 | \n", + "
21 | \n", + "q062228 | \n", + "doc062200116273 | \n", + "2 | \n", + "0 | \n", + "[doc062200100875, doc062201300294, doc06220380... | \n", + "0 | \n", + "
25 | \n", + "q062287 | \n", + "doc062200209981 | \n", + "1 | \n", + "0 | \n", + "[doc062200116769, doc062215804698, doc06220010... | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
9636 | \n", + "q062225194 | \n", + "doc062200201379 | \n", + "2 | \n", + "0 | \n", + "[doc062208607784, doc062200116561, doc06220830... | \n", + "0 | \n", + "
9640 | \n", + "q062225194 | \n", + "doc062200205011 | \n", + "1 | \n", + "0 | \n", + "[doc062208406169, doc062208603880, doc06220860... | \n", + "0 | \n", + "
9641 | \n", + "q062225194 | \n", + "doc062200204433 | \n", + "1 | \n", + "0 | \n", + "[doc062200112015, doc062200113353, doc06220841... | \n", + "1 | \n", + "
9647 | \n", + "q062225197 | \n", + "doc062200207538 | \n", + "1 | \n", + "0 | \n", + "[doc062202104464, doc062201710012, doc06220200... | \n", + "0 | \n", + "
9652 | \n", + "q062225197 | \n", + "doc062200107121 | \n", + "1 | \n", + "0 | \n", + "[doc062201902653, doc062200117385, doc06220880... | \n", + "0 | \n", + "
2626 rows × 6 columns
\n", + "\n", + " | qid | \n", + "query | \n", + "q0 | \n", + "rank | \n", + "score | \n", + "system | \n", + "docno | \n", + "tira_task | \n", + "tira_dataset | \n", + "tira_first_stage_run_id | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "q06223196 | \n", + "car shelter | \n", + "0 | \n", + "1 | \n", + "-0.003520 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062201708464 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
1 | \n", + "q06223196 | \n", + "car shelter | \n", + "0 | \n", + "2 | \n", + "-0.005353 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062200108613 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
2 | \n", + "q06223196 | \n", + "car shelter | \n", + "0 | \n", + "3 | \n", + "-0.006328 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062200206319 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
3 | \n", + "q06223196 | \n", + "car shelter | \n", + "0 | \n", + "4 | \n", + "-0.006333 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062200112743 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
4 | \n", + "q06223196 | \n", + "car shelter | \n", + "0 | \n", + "5 | \n", + "-0.006599 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062201708471 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
66562 | \n", + "q062225197 | \n", + "cheapest car | \n", + "0 | \n", + "96 | \n", + "-10.073474 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062202202627 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
66563 | \n", + "q062225197 | \n", + "cheapest car | \n", + "0 | \n", + "97 | \n", + "-10.074920 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062202005382 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
66564 | \n", + "q062225197 | \n", + "cheapest car | \n", + "0 | \n", + "98 | \n", + "-10.288811 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062202002893 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
66565 | \n", + "q062225197 | \n", + "cheapest car | \n", + "0 | \n", + "99 | \n", + "-10.347551 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062214906085 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
66566 | \n", + "q062225197 | \n", + "cheapest car | \n", + "0 | \n", + "100 | \n", + "-10.347551 | \n", + "castorini/monot5-base-msmarco-10k | \n", + "doc062214903805 | \n", + "ir-benchmarks | \n", + "longeval-train-20230513-training | \n", + "2024-03-18-12-56-01 | \n", + "
66567 rows × 10 columns
\n", + "