Skip to content

Commit

Permalink
remove stemmer
Browse files Browse the repository at this point in the history
  • Loading branch information
MacuraFlo committed Jul 1, 2024
1 parent d6466f7 commit c6832fa
Show file tree
Hide file tree
Showing 2 changed files with 66,322 additions and 66,383 deletions.
139 changes: 39 additions & 100 deletions baseline-retrieval-system/baseline-retrieval-system.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -36,19 +36,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n",
"\n",
"No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n"
]
}
],
"outputs": [],
"source": [
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n",
"ensure_pyterrier_is_loaded()\n",
Expand All @@ -66,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -78,14 +68,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:03<00:00, 36806.83it/s]\n"
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 60669.18it/s]\n"
]
}
],
Expand All @@ -100,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -126,61 +116,61 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 70%|███████ | 88932/126958 [00:14<00:05, 6996.92it/s]"
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 70%|███████ | 88895/126958 [00:10<00:04, 7763.13it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"16:20:13.100 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (2020.mir_conference-2020.1) - further warnings are suppressed\n"
"08:26:41.986 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (2020.mir_conference-2020.1) - further warnings are suppressed\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:18<00:00, 6846.64it/s] \n"
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:14<00:00, 8881.11it/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"16:20:19.454 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents\n"
"08:26:47.521 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents\n"
]
}
],
"source": [
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer='PorterStemmer', meta={'docno': 75, 'text': 4096})\n",
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer=None, meta={'docno': 75, 'text': 4096})\n",
"\n",
"index_full_text = pt.IndexFactory.of(indexer.index(pt_dataset.get_corpus_iter()))\n",
"bm25 = pt.BatchRetrieve(index_full_text, wmodel=\"BM25\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16:25:26.463 [ForkJoinPool-17-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (D12-1116) - further warnings are suppressed\n",
"16:25:26.643 [ForkJoinPool-17-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents\n"
"08:26:52.024 [ForkJoinPool-3-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (D12-1116) - further warnings are suppressed\n",
"08:26:52.192 [ForkJoinPool-3-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents\n"
]
}
],
"source": [
"indexer = pt.IterDictIndexer(\"/tmp/index-EVENT\", overwrite=True, stemmer='PorterStemmer', meta={'docno': 75, 'text': 4096})\n",
"indexer = pt.IterDictIndexer(\"/tmp/index-EVENT\", overwrite=True, stemmer=None, meta={'docno': 75, 'text': 4096})\n",
"\n",
"index_event_text = pt.IndexFactory.of(indexer.index(retain_only_entity_type_text(set(['EVENT']))))\n",
"bm25_event = pt.BatchRetrieve(index_event_text, wmodel=\"BM25\")"
Expand All @@ -198,20 +188,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16:55:06.711 [ForkJoinPool-20-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (W11-4616) - further warnings are suppressed\n",
"16:55:10.988 [ForkJoinPool-20-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 202 empty documents\n"
"08:26:56.938 [ForkJoinPool-4-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (W11-4616) - further warnings are suppressed\n",
"08:27:00.338 [ForkJoinPool-4-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 202 empty documents\n"
]
}
],
"source": [
"indexer = pt.IterDictIndexer(\"/tmp/index-MULTIPLE_TYPES\", overwrite=True, stemmer='PorterStemmer', meta={'docno': 75, 'text': 4096})\n",
"indexer = pt.IterDictIndexer(\"/tmp/index-MULTIPLE_TYPES\", overwrite=True, stemmer=None, meta={'docno': 75, 'text': 4096})\n",
"\n",
"index_with_entities = pt.IndexFactory.of(indexer.index(retain_only_entity_type_text(set(['ORG', 'PERSON', 'WORK_OF_ART', 'NORP', 'PRODUCT', 'FAC', 'LOC', 'MONEY', 'EVENT', 'LAW']))))\n",
"bm25_ents = pt.BatchRetrieve(index_with_entities, wmodel=\"BM25\")"
Expand All @@ -228,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -237,84 +227,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.\n"
]
}
],
"source": [
"run = entity_pipe(pt_dataset.get_topics())"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>ndcg_cut_10</th>\n",
" <th>P_10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BR(BM25)</td>\n",
" <td>0.374041</td>\n",
" <td>0.332353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Sum(ScalarProd(BR(BM25), 0.99), ScalarProd(BR(...</td>\n",
" <td>0.375530</td>\n",
" <td>0.335294</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sum(ScalarProd(BR(BM25), 0.98), ScalarProd(BR(...</td>\n",
" <td>0.369844</td>\n",
" <td>0.330882</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Sum(ScalarProd(BR(BM25), 0.97), ScalarProd(BR(...</td>\n",
" <td>0.362040</td>\n",
" <td>0.326471</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name ndcg_cut_10 P_10\n",
"0 BR(BM25) 0.374041 0.332353\n",
"1 Sum(ScalarProd(BR(BM25), 0.99), ScalarProd(BR(... 0.375530 0.335294\n",
"2 Sum(ScalarProd(BR(BM25), 0.98), ScalarProd(BR(... 0.369844 0.330882\n",
"3 Sum(ScalarProd(BR(BM25), 0.97), ScalarProd(BR(... 0.362040 0.326471"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n",
"Done. run file is stored under \"../runs/run.txt\".\n"
]
}
],
"source": [
Expand Down
Loading

0 comments on commit c6832fa

Please sign in to comment.