Skip to content

Commit

Permalink
Prepare Doccano tools (wip)
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Apr 25, 2024
1 parent de49a19 commit 359132d
Show file tree
Hide file tree
Showing 4 changed files with 384 additions and 2 deletions.
37 changes: 37 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,43 @@ Our hands-on tutorials lower the barrier of entry to implementing IR models and

A full [list of all covered tutorials](tutorials/README.md#contents) and further information on how to run the tutorials on your local machine can be found in the [tutorial readme](tutorials/README.md).

### Tools for Relevance Judgments (work in progress)

We also include tools that ease uploading pooled documents and downloading relevance judgments to/from the Doccano annotation platform. To use these tools, follow these steps:

1. Install [Python 3.10](https://python.org/downloads/) or later.
2. Create and activate a virtual environment:

```shell
python3.10 -m venv venv/
source venv/bin/activate
```

3. Install dependencies:

```shell
pip install -e .
```

4. Create top-k pools of documents retrieved by TIREx baselines: **TODO**

```shell
teaching-ir pool-documents [TODO: TIRA parameters] /path/to/topics1.xml [/path/to/topics2.xml ...]
```

5. Prepare the relevance judgments in Doccano like so:

```shell
teaching-ir prepare-relevance-judgments project-prefix /path/to/pool1.jsonl /path/to/pool2.jsonl ...
```

6. All teams can now work on their relevance judgments.
7. Export the relevance judgments as [Qrels](https://trec.nist.gov/data/qrels_eng/) from Doccano like so:

```shell
teaching-ir export-relevance-judgments project-prefix /path/to/pool1.jsonl /path/to/pool2.jsonl ...
```

## Archived courses

The below list includes finished (✅), ongoing (⏳) and future (🔜) IR courses that already benefit from our teaching resources.
Expand Down
3 changes: 3 additions & 0 deletions cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from importlib_metadata import version

# Version string of the installed "teaching-ir-with-shared-tasks" distribution,
# looked up from the package metadata when this package is imported.
__version__ = version("teaching-ir-with-shared-tasks")
335 changes: 335 additions & 0 deletions cli/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
from pathlib import Path
from secrets import choice
from string import ascii_letters, digits
from typing import Annotated, Any, Sequence, TypeAlias
from urllib.parse import urljoin
from warnings import warn

from annotated_types import Len
from click import argument, confirm, group, Context, Parameter, echo, option, Path as PathType
from doccano_client import DoccanoClient
from pandas import concat, read_json
from slugify import slugify
from tqdm import tqdm

from cli import __version__ as app_version


def print_version(
    context: Context,
    _parameter: Parameter,
    value: Any,
) -> None:
    """Option callback that prints the application version and exits.

    Nothing happens when the flag was not given (``value`` is falsy) or when
    Click is in resilient parsing mode.
    """
    if value and not context.resilient_parsing:
        echo(app_version)
        context.exit()


# Root command group of the command line interface. The eager ``--version``
# flag is handled completely by its callback and is therefore not passed to
# the function as a parameter.
# NOTE: Comments are used instead of a docstring here, because Click would
# otherwise display the docstring as the group's help text.
@group()
@option(
    "-V",
    "--version",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=print_version,
)
def cli() -> None:
    pass


# Placeholder sub-command: pooling documents is not implemented yet.
@cli.command()
def pool_documents() -> None:
    raise NotImplementedError


def _user_name(project_prefix: str, group: str) -> str:
    """Return the user name for a group: ``<project_prefix>-<slugified group>``."""
    return f"{project_prefix}-{slugify(group)}"


# Type of the names returned by `_project_name`: a string annotated with a
# length constraint of at least 1 and at most 100 characters.
_ProjectName: TypeAlias = Annotated[str, Len(min_length=1, max_length=100)]


def _project_name(project_prefix: str, query_id: str) -> _ProjectName:
    """Return the project name for a query: ``<project_prefix>-<query_id>``.

    Names longer than 100 characters are cut off after the 100th character,
    and a ``UserWarning`` is emitted to report the shortening.

    Raises:
        ValueError: If ``query_id`` is empty.
    """
    if len(query_id) < 1:
        raise ValueError("Empty query ID.")
    full_name = f"{project_prefix}-{query_id}"
    shortened = full_name[:100]
    # The slice only differs from the full name if the name was too long.
    if shortened != full_name:
        warn(UserWarning(
            f"Project name '{full_name}' is too long. Shortening to '{shortened}'."))
    return shortened


# Characters that generated passwords are drawn from (ASCII letters and digits).
_alphabet = ascii_letters + digits


def _generate_password(length: int = 10) -> str:
    """Return a random password consisting of ``length`` characters.

    Each character is drawn independently from ``_alphabet`` with
    ``secrets.choice``. A ``length`` of zero (or less) yields an empty string.
    """
    # Join once instead of growing the string character by character.
    return "".join(choice(_alphabet) for _ in range(length))


# Description that is set for newly created projects.
_DEFAULT_PROJECT_DESCRIPTION = (
    "Document relevance judgments. (Automatically generated by TIREx.)"
)

# Guideline text that is set for newly created projects.
# TODO: Add guidelines.
_DEFAULT_PROJECT_GUIDELINES = "TODO\n(Automatically generated by TIREx.)"

# Tag for marking auto-generated projects.
_TAG = "teaching-ir"

# Label for relevant documents: text, key (passed as `suffix_key`), and color.
_LABEL_RELEVANT = "relevant"
_LABEL_KEY_RELEVANT = "1"
_LABEL_COLOR_RELEVANT = "#086F02"

# Label for non-relevant documents: text, key (passed as `suffix_key`), and color.
_LABEL_NOT_RELEVANT = "not relevant"
_LABEL_KEY_NOT_RELEVANT = "2"
_LABEL_COLOR_NOT_RELEVANT = "#D33115"


def _ensure_label_type(
    doccano: DoccanoClient,
    project_id: int,
    existing_labels: Sequence[Any],
    text: str,
    key: str,
    color: str,
) -> int | None:
    """Create or update the category label type with the given text.

    If one of ``existing_labels`` already has the text ``text``, it is updated
    to use the given key and color. Otherwise, a new label type is created.

    Returns:
        The ID of the pre-existing label type that was updated, or ``None``
        if a new label type was created.
    """
    existing = next(
        (label for label in existing_labels if label.text == text),
        None,
    )
    if existing is None:
        doccano.create_label_type(
            project_id=project_id,
            type="category",
            text=text,
            prefix_key=None,
            suffix_key=key,
            color=color,
        )
        return None
    doccano.update_label_type(
        project_id=project_id,
        label_type_id=existing.id,
        type="category",
        text=text,
        prefix_key=None,
        suffix_key=key,
        color=color,
    )
    return existing.id


@cli.command()
@option(
    "-d", "--doccano-url",
    type=str,
    required=True,
    prompt="Doccano URL",
    envvar="DOCCANO_URL",
)
@option(
    "-u", "--doccano-username",
    type=str,
    required=True,
    prompt="Doccano username",
    envvar="DOCCANO_USERNAME",
)
@option(
    # The short flag must differ from the username's "-u".
    "-p", "--doccano-password",
    type=str,
    required=True,
    prompt="Doccano password",
    hide_input=True,
    envvar="DOCCANO_PASSWORD",
)
@argument(
    "project_prefix",
    type=str,
)
@argument(
    "pooled_documents_paths",
    type=PathType(
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        allow_dash=False,
        path_type=Path,
    ),
    nargs=-1,
)
def prepare_relevance_judgments(
    doccano_url: str,
    doccano_username: str,
    doccano_password: str,
    project_prefix: str,
    pooled_documents_paths: Sequence[Path],
) -> None:
    """Prepare Doccano users, projects, and labels for relevance judgments.

    Reads the pooled documents (JSON lines files) and ensures that a Doccano
    user exists for each group and a Doccano project exists for each topic,
    with the labels 'relevant' and 'not relevant' set up in each project.
    \f

    The form feed above ends the help text that Click shows for the command.

    Args:
        doccano_url: Base URL of the Doccano instance.
        doccano_username: User name for logging in to Doccano.
        doccano_password: Password for logging in to Doccano.
        project_prefix: Prefix of all user and project names (slugified
            before use).
        pooled_documents_paths: JSON lines files with the columns ``group``,
            ``query_id``, ``query``, ``description``, ``narrative``,
            ``doc_id``, and ``text``. Nothing is done if no path is given.

    Raises:
        ValueError: If the project prefix is empty after slugification.
        RuntimeError: If overwriting a project that was not generated by
            this tool is declined.
    """
    # Validate after slugification: a non-empty prefix like "!!!" can still
    # result in an empty slug.
    project_prefix = slugify(project_prefix)
    if not project_prefix:
        raise ValueError("Empty project prefix.")

    if not pooled_documents_paths:
        return

    doccano = DoccanoClient(doccano_url)
    doccano.login(
        username=doccano_username,
        password=doccano_password,
    )
    echo("Successfully authenticated with Doccano API.")

    # Read all columns as strings, so that IDs are not parsed as numbers.
    pool = concat([
        read_json(
            path,
            lines=True,
            dtype={
                "group": str,
                "query_id": str,
                "query": str,
                "description": str,
                "narrative": str,
                "doc_id": str,
                "text": str,
            },
        )
        for path in tqdm(
            pooled_documents_paths,
            desc="Read pooled documents",
            unit="path",
        )
    ])
    echo(f"Found {len(pool)} pooled documents.")

    groups: set[str] = set(pool["group"])
    echo(f"Found {len(groups)} groups.")

    query_ids: set[str] = set(pool["query_id"])
    echo(f"Found {len(query_ids)} topics.")

    # Create a mapping of expected users.
    expected_users_dict = {
        group_name: _user_name(project_prefix, group_name)
        for group_name in groups
    }
    expected_users = set(expected_users_dict.values())

    # Create a mapping of expected projects.
    expected_projects_dict = {
        query_id: _project_name(project_prefix, query_id)
        for query_id in query_ids
    }
    expected_projects = set(expected_projects_dict.values())

    # Create missing users.
    all_users = {
        user.username: user
        for user in doccano.search_users()
    }
    existing_users = expected_users & all_users.keys()
    non_existing_users = expected_users - all_users.keys()
    echo(f"On Doccano, {len(existing_users)} users already exist, "
         f"{len(non_existing_users)} need to be created.")
    if non_existing_users:
        echo("Creating missing users...")
        for user in sorted(non_existing_users):
            password = _generate_password()
            new_user = doccano.create_user(user, password)
            # The lookup is keyed by user name, not by ID.
            all_users[user] = new_user
            echo(f"Created user '{user}' with password '{password}'.")

    # Create missing projects.
    all_projects = {
        project.name: project
        for project in doccano.list_projects()
    }
    existing_projects = expected_projects & all_projects.keys()
    non_existing_projects = expected_projects - all_projects.keys()
    echo(f"On Doccano, {len(existing_projects)} projects already exist, "
         f"{len(non_existing_projects)} need to be created.")
    if non_existing_projects:
        echo("Creating missing projects...")
        for project in sorted(non_existing_projects):
            new_project = doccano.project.create(
                name=project,
                description=_DEFAULT_PROJECT_DESCRIPTION,
                project_type="DocumentClassification",
                guideline=_DEFAULT_PROJECT_GUIDELINES,
                random_order=False,
                collaborative_annotation=True,
                single_class_classification=True,
                tags=[_TAG],
            )
            # The lookup is keyed by project name, not by ID. Otherwise, the
            # new project would not be found when preparing the projects below.
            all_projects[project] = new_project
            echo(f"Created project '{project}'.")

    # Projects that already existed but do not carry the tag were not
    # generated by this tool and are only taken over after confirmation.
    unmanaged_projects = {
        project
        for project in existing_projects
        if _TAG not in all_projects[project].tags
    }
    if unmanaged_projects:
        echo("Checking previously unmanaged projects...")
        for project in sorted(unmanaged_projects):
            project_id = all_projects[project].id
            project_url = urljoin(
                doccano_url, f"/projects/{project_id}")
            if not confirm(f"The Doccano project '{project}' ({project_url}) does not appear to be generated by this tool. Overwrite project"):
                raise RuntimeError(
                    f"Cannot prepare judgments due to clash with unmanaged Doccano project '{project}'.")
            doccano.project.update(
                project_id=project_id,
                tags=list({*all_projects[project].tags, _TAG}),
            )

    echo("Preparing projects...")
    for project in sorted(expected_projects):
        project_id = all_projects[project].id

        echo(f"Preparing labels for Doccano project '{project}'.")
        # Materialize the labels, as they are iterated multiple times.
        existing_labels = list(doccano.list_label_types(
            project_id=project_id,
            type="category",
        ))
        updated_label_ids = (
            _ensure_label_type(
                doccano,
                project_id,
                existing_labels,
                text=_LABEL_RELEVANT,
                key=_LABEL_KEY_RELEVANT,
                color=_LABEL_COLOR_RELEVANT,
            ),
            _ensure_label_type(
                doccano,
                project_id,
                existing_labels,
                text=_LABEL_NOT_RELEVANT,
                key=_LABEL_KEY_NOT_RELEVANT,
                color=_LABEL_COLOR_NOT_RELEVANT,
            ),
        )
        compatible_label_ids = {
            label_id
            for label_id in updated_label_ids
            if label_id is not None
        }
        # All other labels that existed before are deleted (after
        # confirmation), so that only the two expected labels remain.
        incompatible_labels = [
            label for label in existing_labels
            if label.id not in compatible_label_ids
        ]
        if incompatible_labels:
            confirm(
                f"Found {len(incompatible_labels)} incompatible labels "
                f"for Doccano project '{project}'. Delete labels", abort=True)
            doccano.bulk_delete_label_types(
                project_id=project_id,
                label_type_ids=[label.id for label in incompatible_labels],
                type="category",
            )

        echo(f"Preparing annotators for Doccano project '{project}'.")
        # TODO: Update project members.

        echo(f"Preparing documents for Doccano project '{project}'.")
        # TODO: Update documents.


# Run the command line interface when the module is executed as a script.
if __name__ == "__main__":
    # Click parses the command line itself, so the group is called without
    # arguments; the pylint check E1120 is disabled for this call.
    # pylint: disable=E1120
    cli()
Loading

0 comments on commit 359132d

Please sign in to comment.