Skip to content

Commit

Permalink
Add Elasticsearch PubMed full text extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Jun 24, 2024
1 parent 511cf95 commit 643c672
Show file tree
Hide file tree
Showing 6 changed files with 341 additions and 150 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ Webis at TREC 2024 BioGen.

## Installation

1. Install [Python 3.10](https://python.org/downloads/).
1. Install [Python 3.11](https://python.org/downloads/).
2. Create and activate a virtual environment:

```shell
python3.10 -m venv venv/
python3.11 -m venv venv/
source venv/bin/activate
```

Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@ authors = [
]
description = "Webis at TREC 2024 BioGen."
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.11"
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Operating System :: OS Independent",
Expand Down
23 changes: 20 additions & 3 deletions trec_biogen/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,23 @@ def cli() -> None:


@cli.command()
def index_pubmed_full_texts() -> None:
from trec_biogen.pubmed_fulltexts import index_pubmed_full_texts
index_pubmed_full_texts()
@option(
    "--dry-run/",
    type=bool,
    default=False,
)
@option(
    "--refetch/",
    type=bool,
    default=False,
)
def index_pubmed_full_texts(
    dry_run: bool = False,
    refetch: bool = False,
) -> None:
    # Command body: forwards both options to the asynchronous indexing
    # routine and runs that coroutine to completion.
    #
    # A comment is used instead of a docstring on purpose, so that the text
    # click derives from the function for `--help` stays unchanged.
    #
    # NOTE(review): the option declarations end in a bare "/" with no
    # off-switch after it; confirm click parses them the way the command is
    # meant to be invoked.

    # Both imports live inside the function, so they are only resolved when
    # this command is actually executed.
    from asyncio import run

    from trec_biogen.pubmed_fulltexts import default_index_pubmed_full_texts

    indexing = default_index_pubmed_full_texts(
        dry_run=dry_run,
        refetch=refetch,
    )
    run(indexing)
25 changes: 24 additions & 1 deletion trec_biogen/elasticsearch.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

from os import environ
from elasticsearch7 import Elasticsearch
from elasticsearch7 import AsyncElasticsearch, Elasticsearch


def elasticsearch_connection() -> Elasticsearch:
Expand All @@ -25,3 +25,26 @@ def elasticsearch_connection() -> Elasticsearch:
read_timeout=60,
max_retries=10,
)

def async_elasticsearch_connection() -> AsyncElasticsearch:
    """Create an ``AsyncElasticsearch`` client from environment variables.

    The following variables are read:

    - ``ELASTICSEARCH_URL`` (required): passed to the client as ``hosts``.
    - ``ELASTICSEARCH_USERNAME`` / ``ELASTICSEARCH_PASSWORD`` (optional):
      passed together as ``http_auth``; either both or neither must be set.

    Returns:
        A newly constructed ``AsyncElasticsearch`` instance.

    Raises:
        KeyError: If ``ELASTICSEARCH_URL`` is not set.
        ValueError: If only one of username and password is set.
    """
    url: str = environ["ELASTICSEARCH_URL"]
    username: str | None = environ.get("ELASTICSEARCH_USERNAME")
    password: str | None = environ.get("ELASTICSEARCH_PASSWORD")

    # Reject half-configured credentials before building the client.
    if username is not None and password is None:
        raise ValueError("Must provide both username and password or neither.")
    if password is not None and username is None:
        raise ValueError("Must provide both password and username or neither.")

    # Past the guards, the two values are either both set or both missing.
    credentials: tuple[str, str] | None = (
        (username, password)
        if username is not None and password is not None
        else None
    )

    # NOTE(review): verify which of the timeout keywords below the 7.x
    # client actually consumes at construction time.
    return AsyncElasticsearch(
        hosts=url,
        http_auth=credentials,
        request_timeout=60,
        read_timeout=60,
        max_retries=10,
    )
15 changes: 6 additions & 9 deletions trec_biogen/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class MeshTerm(InnerDoc):

class Article(Document):
class Index:
name = "corpus_pubmed_2024"
settings = {
"number_of_shards": 3,
"number_of_replicas": 2,
Expand Down Expand Up @@ -87,6 +88,11 @@ class Index:
"""List of languages."""
source_file: str = Keyword(required=True) # type: ignore
"""Basename of the XML file that contains this article."""
full_text: str | None = Text() # type: ignore
"""Extracted full text of the article."""
last_fetched_full_text: datetime | None = Date(
default_timezone="UTC") # type: ignore
"""Last date at which the full text has been extracted."""

@property
def pubmed_url(self) -> str:
Expand All @@ -105,12 +111,3 @@ def doi_url(self) -> str | None:
return f"https://doi.org/{self.doi}"


class FullTextArticle(Article):
class Index:
settings = {
"number_of_shards": 5,
"number_of_replicas": 2,
}

full_text: str = Text(required=True) # type: ignore
"""Extracted full text of the article."""
Loading

0 comments on commit 643c672

Please sign in to comment.