diff --git a/tutorials/README.md b/tutorials/README.md index bf0751a..2955a78 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -37,6 +37,7 @@ More complex topics that might not be suited to every IR course are still covere | Query Spelling Correction (work in progress)| [🔗](tutorial-spelling-correction.ipynb) | ⏳ | | Splade for Query Processing (work in progress)| [🔗](tutorial-splade-query-processing.ipynb) | ⏳ | | Splade for Document Processing (work in progress)| [🔗](tutorial-splade-ranking.ipynb) | ⏳ | +| DocT5Query (work in progress)| [🔗](tutorial-doc-t5-query.ipynb) | ⏳ | | Re-ranking with cross-encoders or bi-encoders ([work in progress](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3)) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | | _Anyhting missing? [Propose new tutorial.](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/new)_ | | | diff --git a/tutorials/tutorial-doc-t5-query.ipynb b/tutorials/tutorial-doc-t5-query.ipynb new file mode 100644 index 0000000..66e81a2 --- /dev/null +++ b/tutorials/tutorial-doc-t5-query.ipynb @@ -0,0 +1,641 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IR Lab Tutorial: Document Expansion with DocT5Query\n", + "\n", + "TBD..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import All Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n", + "\n", + "No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n" + ] + } + ], + "source": [ + "from tira.third_party_integrations import ensure_pyterrier_is_loaded\n", + "from tira.rest_api_client import Client\n", + "ensure_pyterrier_is_loaded()\n", + "import pandas as pd\n", + "import pyterrier as pt\n", + "from tqdm import tqdm\n", + "import gzip\n", + "import json\n", + "\n", + "tira = Client()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DocT5 Query on a corpus with already high recall\n", + "\n", + "TBD." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = 'antique-test-20230107-training'\n", + "pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>qid</th><th>query</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>3990512</td><td>how can we get concentration onsomething</td></tr>\n",
+       "    <tr><th>1</th><td>714612</td><td>why doesn t the water fall off earth if it s r...</td></tr>\n",
+       "    <tr><th>2</th><td>2528767</td><td>how do i determine the charge of the iron ion ...</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>
" + ], + "text/plain": [ + " qid query\n", + "0 3990512 how can we get concentration onsomething\n", + "1 714612 why doesn t the water fall off earth if it s r...\n", + "2 2528767 how do i determine the charge of the iron ion ..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pt_dataset.get_topics('text').head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name</th><th>recall_1000</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>BM25</td><td>0.788732</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "
" + ], + "text/plain": [ + " name recall_1000\n", + "0 BM25 0.788732" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pt.Experiment(\n", + " retr_systems=[bm25],\n", + " topics=pt_dataset.get_topics('text'),\n", + " qrels=pt_dataset.get_qrels(),\n", + " names=['BM25'],\n", + " eval_metrics=['recall_1000']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def doc_t5_query(dataset):\n", + " docs = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset) + '/documents.jsonl.gz'\n", + " with gzip.open(docs, 'rt') as f:\n", + " for l in tqdm(f):\n", + " l = json.loads(l)\n", + " l['text'] = l['querygen']\n", + " l['docno'] = l['doc_id']\n", + " del l['doc_id']\n", + " del l['querygen']\n", + " yield l\n", + "\n", + "def doc_t5_query_index(dataset):\n", + " indexer = pt.IterDictIndexer(\"/tmp/index2\", overwrite=True, meta={'docno': 100, 'text': 20480})\n", + " index_ref = indexer.index(doc_t5_query(dataset))\n", + " return pt.IndexFactory.of(index_ref)\n", + "\n", + "#index = doc_t5_query_index(dataset)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "index = pt.IndexFactory.of('/tmp/index2')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "188633it [00:08, 21509.20it/s]\n" + ] + } + ], + "source": [ + "docs_retrieved_by_bm25 = {}\n", + "\n", + "bm25_result = bm25(pt_dataset.get_topics('title'))\n", + "\n", + "for _, i in tqdm(bm25_result.iterrows()):\n", + " qid, docno = str(i['qid']), str(i['docno'])\n", + "\n", + " if qid not in docs_retrieved_by_bm25:\n", + " docs_retrieved_by_bm25[qid] = set()\n", + " \n", + " docs_retrieved_by_bm25[qid].add(docno)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "omit_already_retrieved_docs = lambda i: i[i.apply(lambda j: str(j['docno']) not in docs_retrieved_by_bm25[str(j['qid'])], axis=1)]\n", + "omit_already_retrieved_docs = pt.apply.generic(omit_already_retrieved_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "bm25_doct5query = pt.BatchRetrieve(index, wmodel=\"BM25\")\n", + "\n", + "bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name</th><th>recall_1000</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>DocT5Query &gt;&gt; BM25</td><td>0.534685</td></tr>\n",
+       "    <tr><th>1</th><td>DocT5Query w.o. BM25 &gt;&gt; BM25</td><td>0.019399</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "
" + ], + "text/plain": [ + " name recall_1000\n", + "0 DocT5Query >> BM25 0.534685\n", + "1 DocT5Query w.o. BM25 >> BM25 0.019399" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pt.Experiment(\n", + " retr_systems=[bm25_doct5query, bm25_doct5query_new],\n", + " topics=pt_dataset.get_topics('text'),\n", + " qrels=pt_dataset.get_qrels(),\n", + " names=['DocT5Query >> BM25', 'DocT5Query w.o. BM25 >> BM25'],\n", + " eval_metrics=['recall_1000']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DocT5 Query on a corpus with lower recall\n", + "\n", + "TBD" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = 'longeval-train-20230513-training'\n", + "pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name</th><th>recall_1000</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>BM25</td><td>0.496595</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "
" + ], + "text/plain": [ + " name recall_1000\n", + "0 BM25 0.496595" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pt.Experiment(\n", + " retr_systems=[bm25],\n", + " topics=pt_dataset.get_topics('text'),\n", + " qrels=pt_dataset.get_qrels(),\n", + " names=['BM25'],\n", + " eval_metrics=['recall_1000']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/doc-t5-query/2024-03-19-19-46-01.zip\n", + "\tThis is only used for last spot checks before archival to Zenodo.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Download: 100%|██████████| 60.8M/60.8M [00:01<00:00, 38.8MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download finished. Extract...\n", + "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/seanmacavaney\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "16164it [00:07, 5062.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11:34:18.676 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (doc062207101085) - further warnings are suppressed\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1570734it [03:13, 8136.38it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11:37:49.947 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 110 empty documents\n" + ] + } + ], + "source": [ + "index = doc_t5_query_index(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "bm25_doct5query = pt.BatchRetrieve(index, wmodel=\"BM25\") % 100" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "0it [00:00, ?it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "66567it [00:04, 13604.33it/s]\n" + ] + } + ], + "source": [ + "docs_retrieved_by_bm25 = {}\n", + "\n", + "bm25_result = bm25(pt_dataset.get_topics('title'))\n", + "\n", + "for _, i in tqdm(bm25_result.iterrows()):\n", + " qid, docno = str(i['qid']), str(i['docno'])\n", + "\n", + " if qid not in docs_retrieved_by_bm25:\n", + " docs_retrieved_by_bm25[qid] = set()\n", + " \n", + " docs_retrieved_by_bm25[qid].add(docno)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "omit_already_retrieved_docs = lambda i: i[i.apply(lambda j: str(j['docno']) not in docs_retrieved_by_bm25[str(j['qid'])], axis=1)]\n", + "omit_already_retrieved_docs = pt.apply.generic(omit_already_retrieved_docs)\n", + "\n", + "bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name</th><th>recall_1000</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>DocT5Query &gt;&gt; BM25</td><td>0.350889</td></tr>\n",
+       "    <tr><th>1</th><td>DocT5Query w.o. BM25 &gt;&gt; BM25</td><td>0.066767</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "
" + ], + "text/plain": [ + " name recall_1000\n", + "0 DocT5Query >> BM25 0.350889\n", + "1 DocT5Query w.o. BM25 >> BM25 0.066767" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pt.Experiment(\n", + " retr_systems=[bm25_doct5query, bm25_doct5query_new],\n", + " topics=pt_dataset.get_topics('text'),\n", + " qrels=pt_dataset.get_qrels(),\n", + " names=['DocT5Query >> BM25', 'DocT5Query w.o. BM25 >> BM25'],\n", + " eval_metrics=['recall_1000']\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}