diff --git a/tutorials/README.md b/tutorials/README.md
index bf0751a..2955a78 100644
--- a/tutorials/README.md
+++ b/tutorials/README.md
@@ -37,6 +37,7 @@ More complex topics that might not be suited to every IR course are still covere
| Query Spelling Correction (work in progress)| [🔗](tutorial-spelling-correction.ipynb) | ⏳ |
| Splade for Query Processing (work in progress)| [🔗](tutorial-splade-query-processing.ipynb) | ⏳ |
| Splade for Document Processing (work in progress)| [🔗](tutorial-splade-ranking.ipynb) | ⏳ |
+| DocT5Query (work in progress)| [🔗](tutorial-doc-t5-query.ipynb) | ⏳ |
| Re-ranking with cross-encoders or bi-encoders ([work in progress](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3)) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) | [⏳](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/3) |
| _Anything missing? [Propose new tutorial.](https://github.com/tira-io/teaching-ir-with-shared-tasks/issues/new)_ | | |
diff --git a/tutorials/tutorial-doc-t5-query.ipynb b/tutorials/tutorial-doc-t5-query.ipynb
new file mode 100644
index 0000000..66e81a2
--- /dev/null
+++ b/tutorials/tutorial-doc-t5-query.ipynb
@@ -0,0 +1,641 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# IR Lab Tutorial: Document Expansion with DocT5Query\n",
+ "\n",
+ "TBD..."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import All Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n",
+ "\n",
+ "No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from tira.third_party_integrations import ensure_pyterrier_is_loaded\n",
+ "from tira.rest_api_client import Client\n",
+ "ensure_pyterrier_is_loaded()\n",
+ "import pandas as pd\n",
+ "import pyterrier as pt\n",
+ "from tqdm import tqdm\n",
+ "import gzip\n",
+ "import json\n",
+ "\n",
+ "tira = Client()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## DocT5Query on a corpus with already high recall\n",
+ "\n",
+ "TBD."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = 'antique-test-20230107-training'\n",
+ "pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " qid | \n",
+ " query | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3990512 | \n",
+ " how can we get concentration onsomething | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 714612 | \n",
+ " why doesn t the water fall off earth if it s r... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2528767 | \n",
+ " how do i determine the charge of the iron ion ... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " qid query\n",
+ "0 3990512 how can we get concentration onsomething\n",
+ "1 714612 why doesn t the water fall off earth if it s r...\n",
+ "2 2528767 how do i determine the charge of the iron ion ..."
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pt_dataset.get_topics('text').head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " recall_1000 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " BM25 | \n",
+ " 0.788732 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name recall_1000\n",
+ "0 BM25 0.788732"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pt.Experiment(\n",
+ " retr_systems=[bm25],\n",
+ " topics=pt_dataset.get_topics('text'),\n",
+ " qrels=pt_dataset.get_qrels(),\n",
+ " names=['BM25'],\n",
+ " eval_metrics=['recall_1000']\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def doc_t5_query(dataset):\n",
+ "    \"\"\"Stream the DocT5Query output for `dataset` as dicts ready for indexing.\n",
+ "\n",
+ "    Every line of documents.jsonl.gz is parsed as JSON; its 'querygen' value is\n",
+ "    moved to 'text' and its 'doc_id' value to 'docno' before the dict is yielded.\n",
+ "    \"\"\"\n",
+ "    run_dir = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset)\n",
+ "    with gzip.open(run_dir + '/documents.jsonl.gz', 'rt') as lines:\n",
+ "        for line in tqdm(lines):\n",
+ "            doc = json.loads(line)\n",
+ "            # pop() moves each value to its new key and drops the old key in one step\n",
+ "            doc['text'] = doc.pop('querygen')\n",
+ "            doc['docno'] = doc.pop('doc_id')\n",
+ "            yield doc\n",
+ "\n",
+ "def doc_t5_query_index(dataset, index_path=\"/tmp/index2\"):\n",
+ "    \"\"\"Build a Terrier index over the DocT5Query expansions of `dataset`.\n",
+ "\n",
+ "    Args:\n",
+ "        dataset: TIRA dataset id passed on to doc_t5_query().\n",
+ "        index_path: Directory the index is written to. An index that already\n",
+ "            exists there is overwritten, so pass a distinct path per dataset\n",
+ "            to keep several indexes side by side.\n",
+ "\n",
+ "    Returns:\n",
+ "        The index object obtained from pt.IndexFactory.of() for the new index.\n",
+ "    \"\"\"\n",
+ "    indexer = pt.IterDictIndexer(index_path, overwrite=True, meta={'docno': 100, 'text': 20480})\n",
+ "    index_ref = indexer.index(doc_t5_query(dataset))\n",
+ "    return pt.IndexFactory.of(index_ref)\n",
+ "\n",
+ "#index = doc_t5_query_index(dataset)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the index in this run instead of loading whatever a previous session left in\n",
+ "# /tmp/index2, so the notebook also works after a kernel restart on a fresh machine.\n",
+ "index = doc_t5_query_index(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "188633it [00:08, 21509.20it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Map every query id to the set of document ids that BM25 already retrieved for it.\n",
+ "docs_retrieved_by_bm25 = {}\n",
+ "\n",
+ "bm25_result = bm25(pt_dataset.get_topics('title'))\n",
+ "\n",
+ "for _, row in tqdm(bm25_result.iterrows()):\n",
+ "    qid, docno = str(row['qid']), str(row['docno'])\n",
+ "    # setdefault() creates the empty set the first time a query id shows up\n",
+ "    docs_retrieved_by_bm25.setdefault(qid, set()).add(docno)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _drop_docs_retrieved_by_bm25(results):\n",
+ "    \"\"\"Keep only the rows whose document BM25 did not retrieve for the same query.\n",
+ "\n",
+ "    Query ids and document ids are compared as strings. A query id without an\n",
+ "    entry in docs_retrieved_by_bm25 keeps all of its rows instead of raising a\n",
+ "    KeyError.\n",
+ "    \"\"\"\n",
+ "    keep = [\n",
+ "        str(docno) not in docs_retrieved_by_bm25.get(str(qid), ())\n",
+ "        for qid, docno in zip(results['qid'], results['docno'])\n",
+ "    ]\n",
+ "    # A boolean Series aligned to the frame's index also handles an empty result frame\n",
+ "    return results[pd.Series(keep, index=results.index, dtype=bool)]\n",
+ "\n",
+ "omit_already_retrieved_docs = pt.apply.generic(_drop_docs_retrieved_by_bm25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bm25_doct5query = pt.BatchRetrieve(index, wmodel=\"BM25\")\n",
+ "\n",
+ "bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " recall_1000 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DocT5Query >> BM25 | \n",
+ " 0.534685 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DocT5Query w.o. BM25 >> BM25 | \n",
+ " 0.019399 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name recall_1000\n",
+ "0 DocT5Query >> BM25 0.534685\n",
+ "1 DocT5Query w.o. BM25 >> BM25 0.019399"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pt.Experiment(\n",
+ " retr_systems=[bm25_doct5query, bm25_doct5query_new],\n",
+ " topics=pt_dataset.get_topics('text'),\n",
+ " qrels=pt_dataset.get_qrels(),\n",
+ " names=['DocT5Query >> BM25', 'DocT5Query w.o. BM25 >> BM25'],\n",
+ " eval_metrics=['recall_1000']\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## DocT5Query on a corpus with lower recall\n",
+ "\n",
+ "TBD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = 'longeval-train-20230513-training'\n",
+ "pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bm25 = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " recall_1000 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " BM25 | \n",
+ " 0.496595 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name recall_1000\n",
+ "0 BM25 0.496595"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pt.Experiment(\n",
+ " retr_systems=[bm25],\n",
+ " topics=pt_dataset.get_topics('text'),\n",
+ " qrels=pt_dataset.get_qrels(),\n",
+ " names=['BM25'],\n",
+ " eval_metrics=['recall_1000']\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/doc-t5-query/2024-03-19-19-46-01.zip\n",
+ "\tThis is only used for last spot checks before archival to Zenodo.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Download: 100%|██████████| 60.8M/60.8M [00:01<00:00, 38.8MiB/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Download finished. Extract...\n",
+ "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/seanmacavaney\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "16164it [00:07, 5062.00it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11:34:18.676 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (doc062207101085) - further warnings are suppressed\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "1570734it [03:13, 8136.38it/s] \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11:37:49.947 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 110 empty documents\n"
+ ]
+ }
+ ],
+ "source": [
+ "index = doc_t5_query_index(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bm25_doct5query = pt.BatchRetrieve(index, wmodel=\"BM25\") % 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "0it [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "66567it [00:04, 13604.33it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Map every query id to the set of document ids that BM25 already retrieved for it.\n",
+ "docs_retrieved_by_bm25 = {}\n",
+ "\n",
+ "bm25_result = bm25(pt_dataset.get_topics('title'))\n",
+ "\n",
+ "for _, row in tqdm(bm25_result.iterrows()):\n",
+ "    qid, docno = str(row['qid']), str(row['docno'])\n",
+ "    # setdefault() creates the empty set the first time a query id shows up\n",
+ "    docs_retrieved_by_bm25.setdefault(qid, set()).add(docno)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _drop_docs_retrieved_by_bm25(results):\n",
+ "    \"\"\"Keep only the rows whose document BM25 did not retrieve for the same query.\n",
+ "\n",
+ "    Query ids and document ids are compared as strings. A query id without an\n",
+ "    entry in docs_retrieved_by_bm25 keeps all of its rows instead of raising a\n",
+ "    KeyError.\n",
+ "    \"\"\"\n",
+ "    keep = [\n",
+ "        str(docno) not in docs_retrieved_by_bm25.get(str(qid), ())\n",
+ "        for qid, docno in zip(results['qid'], results['docno'])\n",
+ "    ]\n",
+ "    # A boolean Series aligned to the frame's index also handles an empty result frame\n",
+ "    return results[pd.Series(keep, index=results.index, dtype=bool)]\n",
+ "\n",
+ "omit_already_retrieved_docs = pt.apply.generic(_drop_docs_retrieved_by_bm25)\n",
+ "\n",
+ "bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " recall_1000 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DocT5Query >> BM25 | \n",
+ " 0.350889 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DocT5Query w.o. BM25 >> BM25 | \n",
+ " 0.066767 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name recall_1000\n",
+ "0 DocT5Query >> BM25 0.350889\n",
+ "1 DocT5Query w.o. BM25 >> BM25 0.066767"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pt.Experiment(\n",
+ " retr_systems=[bm25_doct5query, bm25_doct5query_new],\n",
+ " topics=pt_dataset.get_topics('text'),\n",
+ " qrels=pt_dataset.get_qrels(),\n",
+ " names=['DocT5Query >> BM25', 'DocT5Query w.o. BM25 >> BM25'],\n",
+ " eval_metrics=['recall_1000']\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}