-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
de49a19
commit 359132d
Showing 4 changed files with 384 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from importlib_metadata import version

# Name of the distribution whose installed metadata supplies the version.
_DISTRIBUTION_NAME = "teaching-ir-with-shared-tasks"

# Looked up once, at import time, through importlib_metadata.
__version__ = version(_DISTRIBUTION_NAME)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,335 @@ | ||
from pathlib import Path | ||
from secrets import choice | ||
from string import ascii_letters, digits | ||
from typing import Annotated, Any, Sequence, TypeAlias | ||
from urllib.parse import urljoin | ||
from warnings import warn | ||
|
||
from annotated_types import Len | ||
from click import argument, confirm, group, Context, Parameter, echo, option, Path as PathType | ||
from doccano_client import DoccanoClient | ||
from pandas import concat, read_json | ||
from slugify import slugify | ||
from tqdm import tqdm | ||
|
||
from cli import __version__ as app_version | ||
|
||
|
||
def print_version(
        context: Context,
        _parameter: Parameter,
        value: Any,
) -> None:
    """Click option callback that prints the application version and exits.

    Nothing happens when the flag value is falsy or while Click is doing
    resilient parsing. Otherwise the version is echoed and the invocation
    is ended through ``context.exit()``.

    :param context: The active Click context.
    :param _parameter: The parameter that triggered the callback (unused).
    :param value: The value of the flag.
    """
    version_requested = value and not context.resilient_parsing
    if version_requested:
        echo(app_version)
        context.exit()
|
||
|
||
# Root command group; subcommands register themselves via ``@cli.command()``.
# The eager ``-V/--version`` flag is handled entirely by ``print_version`` and
# is therefore not passed to the function (``expose_value=False``).
@group()
@option(
    "-V",
    "--version",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=print_version,
)
def cli() -> None:
    # The group itself performs no work; it only hosts the subcommands.
    pass
|
||
|
||
# Placeholder subcommand: registered on the ``cli`` group but not implemented
# yet, so invoking it always raises ``NotImplementedError``.
@cli.command()
def pool_documents() -> None:
    raise NotImplementedError
|
||
|
||
def _user_name(project_prefix: str, group: str) -> str:
    """Build the user name for a group.

    The group name is slugified and appended to the project prefix,
    separated by a hyphen.

    :param project_prefix: Prefix placed in front of the slugified group.
    :param group: Raw group name; slugified before use.
    :return: ``"<project_prefix>-<slugified group>"``.
    """
    return f"{project_prefix}-{slugify(group)}"
|
||
|
||
# Return type of ``_project_name``: a string annotated with a length
# constraint of 1 to 100 characters.
_ProjectName: TypeAlias = Annotated[
    str,
    Len(min_length=1, max_length=100),
]
|
||
|
||
def _project_name(project_prefix: str, query_id: str) -> _ProjectName:
    """Build the project name for a query.

    The name is ``"<project_prefix>-<query_id>"``, cut off after 100
    characters. A ``UserWarning`` is issued whenever the name had to be
    shortened.

    :param project_prefix: Prefix placed in front of the query ID.
    :param query_id: Query identifier; must not be empty.
    :return: The (possibly shortened) project name.
    :raises ValueError: If ``query_id`` is empty.
    """
    max_length = 100

    if len(query_id) == 0:
        raise ValueError("Empty query ID.")

    full_name = f"{project_prefix}-{query_id}"
    shortened = full_name[:max_length]
    if shortened != full_name:
        # Slicing only changes the string when it exceeded the limit.
        warn(UserWarning(
            f"Project name '{full_name}' is too long. Shortening to '{shortened}'."))
    return shortened
|
||
|
||
# Characters that generated passwords are drawn from: ASCII letters and digits.
_alphabet = ascii_letters + digits


def _generate_password(length: int = 10) -> str:
    """Generate a random alphanumeric password.

    Every character is drawn independently from ``_alphabet`` with
    ``secrets.choice``.

    :param length: Number of characters to generate. Zero or a negative
        value yields an empty string.
    :return: The generated password.
    """
    # A single join replaces the character-by-character ``+=`` loop.
    return "".join(choice(_alphabet) for _ in range(length))
|
||
|
||
# Description passed to Doccano when a project is created by this tool.
_DEFAULT_PROJECT_DESCRIPTION = "Document relevance judgments. (Automatically generated by TIREx.)"

# TODO: Add guidelines.
# Guideline text passed to Doccano when a project is created by this tool.
_DEFAULT_PROJECT_GUIDELINES = "TODO\n(Automatically generated by TIREx.)"

# For marking auto-generated projects.
_TAG = "teaching-ir"

# "Relevant" label: display text, suffix key, and colour.
_LABEL_RELEVANT = "relevant"
_LABEL_KEY_RELEVANT = "1"
_LABEL_COLOR_RELEVANT = "#086F02"

# "Not relevant" label: display text, suffix key, and colour.
_LABEL_NOT_RELEVANT = "not relevant"
_LABEL_KEY_NOT_RELEVANT = "2"
_LABEL_COLOR_NOT_RELEVANT = "#D33115"
|
||
|
||
def _ensure_label_type(
        doccano: DoccanoClient,
        project_id: Any,
        existing_labels: Sequence[Any],
        text: str,
        suffix_key: str,
        color: str,
) -> Any:
    """Make sure a category label type with the given text exists.

    If one of ``existing_labels`` already carries ``text``, it is updated so
    that its suffix key and colour match. Otherwise a new label type is
    created.

    :param doccano: Authenticated Doccano client.
    :param project_id: ID of the project the label type belongs to.
    :param existing_labels: Label types currently defined for the project.
    :param text: Display text that identifies the label type.
    :param suffix_key: Suffix key to assign to the label type.
    :param color: Colour to assign to the label type.
    :return: ID of the pre-existing label type that was updated, or ``None``
        if a new label type was created.
    """
    existing = next(
        (label for label in existing_labels if label.text == text),
        None,
    )
    if existing is None:
        doccano.create_label_type(
            project_id=project_id,
            type="category",
            text=text,
            prefix_key=None,
            suffix_key=suffix_key,
            color=color,
        )
        return None
    doccano.update_label_type(
        project_id=project_id,
        label_type_id=existing.id,
        type="category",
        text=text,
        prefix_key=None,
        suffix_key=suffix_key,
        color=color,
    )
    return existing.id


@cli.command()
@option(
    "-d", "--doccano-url",
    type=str,
    required=True,
    prompt="Doccano URL",
    envvar="DOCCANO_URL",
)
@option(
    "-u", "--doccano-username",
    type=str,
    required=True,
    prompt="Doccano username",
    envvar="DOCCANO_USERNAME",
)
@option(
    # Uses "-p": the short flag "-u" is already taken by --doccano-username.
    "-p", "--doccano-password",
    type=str,
    required=True,
    prompt="Doccano password",
    hide_input=True,
    envvar="DOCCANO_PASSWORD",
)
@argument(
    "project_prefix",
    type=str,
)
@argument(
    "pooled_documents_paths",
    type=PathType(
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        allow_dash=False,
        path_type=Path,
    ),
    nargs=-1,
)
def prepare_relevance_judgments(
        doccano_url: str,
        doccano_username: str,
        doccano_password: str,
        project_prefix: str,
        pooled_documents_paths: Sequence[Path],
) -> None:
    """Prepare Doccano users, projects, and labels for relevance judgments.

    \f
    Reads the pooled documents (JSON lines), derives one user per group and
    one project per query ID, creates whatever is missing on Doccano, and
    sets up the "relevant" / "not relevant" label types of every project.
    Updating project members and documents is still TODO.

    :param doccano_url: Base URL of the Doccano instance.
    :param doccano_username: User name used to log in to Doccano.
    :param doccano_password: Password used to log in to Doccano.
    :param project_prefix: Prefix for generated user and project names;
        slugified before use.
    :param pooled_documents_paths: JSON-lines files with the pooled
        documents. Nothing is done if no path is given.
    :raises ValueError: If the project prefix is empty after slugifying.
    :raises RuntimeError: If the user declines to take over an existing
        project that was not generated by this tool.
    """
    # Validate after slugifying so that a prefix made up only of characters
    # that slugify strips is rejected too.
    project_prefix = slugify(project_prefix)
    if not project_prefix:
        raise ValueError("Empty project prefix.")

    if not pooled_documents_paths:
        return

    doccano = DoccanoClient(doccano_url)
    doccano.login(
        username=doccano_username,
        password=doccano_password,
    )
    echo("Successfully authenticated with Doccano API.")

    pool = concat(
        read_json(
            path,
            lines=True,
            dtype={
                "group": str,
                "query_id": str,
                "query": str,
                "description": str,
                "narrative": str,
                "doc_id": str,
                "text": str,
            },
        )
        for path in tqdm(
            pooled_documents_paths,
            desc="Read pooled documents",
            unit="path",
        )
    )
    echo(f"Found {len(pool)} pooled documents.")

    groups: set[str] = set(pool["group"].drop_duplicates().to_list())
    echo(f"Found {len(groups)} groups.")

    query_ids: set[str] = set(pool["query_id"].drop_duplicates().to_list())
    echo(f"Found {len(query_ids)} topics.")

    # Create a mapping of expected users.
    expected_users_dict = {
        group: _user_name(project_prefix, group)
        for group in groups
    }
    expected_users = set(expected_users_dict.values())

    # Create a mapping of expected projects.
    expected_projects_dict = {
        query_id: _project_name(project_prefix, query_id)
        for query_id in query_ids
    }
    expected_projects = set(expected_projects_dict.values())

    # Create missing users. ``all_users`` is keyed by user name.
    all_users = {
        user.username: user
        for user in doccano.search_users()
    }
    existing_users = expected_users & all_users.keys()
    non_existing_users = expected_users - all_users.keys()
    echo(f"On Doccano, {len(existing_users)} users already exist, "
         f"{len(non_existing_users)} need to be created.")
    if non_existing_users:
        echo("Creating missing users...")
    for user in non_existing_users:
        password = _generate_password()
        # Store under the user name to keep the mapping's key type uniform.
        all_users[user] = doccano.create_user(user, password)
        echo(f"Created user '{user}' with password '{password}'.")

    # Create missing projects. ``all_projects`` is keyed by project name.
    all_projects = {
        project.name: project
        for project in doccano.list_projects()
    }
    existing_projects = expected_projects & all_projects.keys()
    non_existing_projects = expected_projects - all_projects.keys()
    echo(f"On Doccano, {len(existing_projects)} projects already exist, "
         f"{len(non_existing_projects)} need to be created.")
    if non_existing_projects:
        echo("Creating missing projects...")
    for project in non_existing_projects:
        # Store under the project name: the loops below look projects up by
        # name, so keying by ID would make new projects unreachable.
        all_projects[project] = doccano.project.create(
            name=project,
            description=_DEFAULT_PROJECT_DESCRIPTION,
            project_type="DocumentClassification",
            guideline=_DEFAULT_PROJECT_GUIDELINES,
            random_order=False,
            collaborative_annotation=True,
            single_class_classification=True,
            tags=[_TAG],
        )
        echo(f"Created project '{project}'.")

    # Pre-existing projects without the tool's tag need explicit consent
    # before they are taken over.
    unmanaged_projects = {
        project
        for project in existing_projects
        if _TAG not in all_projects[project].tags
    }
    if unmanaged_projects:
        echo("Checking previously unmanaged projects...")
    for project in unmanaged_projects:
        project_id = all_projects[project].id
        project_url = urljoin(
            doccano_url, f"/projects/{project_id}")
        if not confirm(
                f"The Doccano project '{project}' ({project_url}) does not "
                f"appear to be generated by this tool. Overwrite project"):
            raise RuntimeError(
                f"Cannot prepare judgments due to clash with unmanaged "
                f"Doccano project '{project}'.")
        doccano.project.update(
            project_id=project_id,
            tags=list({*all_projects[project].tags, _TAG}),
        )

    echo("Preparing projects...")
    for project in expected_projects:
        project_id = all_projects[project].id

        echo(f"Preparing labels for Doccano project '{project}'.")
        # Materialize once; the labels are scanned several times below.
        existing_labels = list(doccano.list_label_types(
            project_id=project_id,
            type="category",
        ))
        updated_label_ids = (
            _ensure_label_type(
                doccano, project_id, existing_labels,
                _LABEL_RELEVANT,
                _LABEL_KEY_RELEVANT,
                _LABEL_COLOR_RELEVANT,
            ),
            _ensure_label_type(
                doccano, project_id, existing_labels,
                _LABEL_NOT_RELEVANT,
                _LABEL_KEY_NOT_RELEVANT,
                _LABEL_COLOR_NOT_RELEVANT,
            ),
        )
        compatible_label_ids = [
            label_id for label_id in updated_label_ids
            if label_id is not None
        ]
        # Every other pre-existing label type is deleted, after confirmation.
        incompatible_labels = [
            label for label in existing_labels
            if label.id not in compatible_label_ids
        ]
        if incompatible_labels:
            confirm(
                f"Found {len(incompatible_labels)} incompatible labels "
                f"for Doccano project '{project}'. Delete labels", abort=True)
            doccano.bulk_delete_label_types(
                project_id=project_id,
                label_type_ids=[label.id for label in incompatible_labels],
                type="category",
            )

        echo(f"Preparing annotators for Doccano project '{project}'.")
        # TODO: Update project members (see ``doccano.list_members``).

        echo(f"Preparing documents for Doccano project '{project}'.")
        # TODO: Update documents.
|
||
|
||
if __name__ == "__main__":
    # Click supplies the command's parameters at runtime, which pylint cannot
    # see; hence the suppressed E1120 (no-value-for-parameter).
    cli()  # pylint: disable=E1120
Oops, something went wrong.