diff --git a/README.md b/README.md index 0092dbf..32cff20 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,43 @@ Our hands-on tutorials lower the barrier of entry to implementing IR models and A full [list of all covered tutorials](tutorials/README.md#contents) and further information on how to run the tutorials on your local machine can be found in the [tutorial readme](tutorials/README.md). +### Tools for Relevance Judgments (work in progress) + +We also include tools that ease uploading pooled documents and downloading relevance judgments to/from the Doccano annotation platform. To use these tools, follow these steps: + +1. Install [Python 3.10](https://python.org/downloads/) or later. +2. Create and activate a virtual environment: + + ```shell + python3.10 -m venv venv/ + source venv/bin/activate + ``` + +3. Install dependencies: + + ```shell + pip install -e . + ``` + +4. Create top-k pools of documents retrieved by a TIREx baseline: **TODO** + + ```shell + teaching-ir pool-documents [TODO: TIRA parameters] /path/to/topics1.xml [/path/to/topics2.xml ...] + ``` + +5. Prepare the relevance judgments in Doccano like so: + + ```shell + teaching-ir prepare-relevance-judgments project-prefix /path/to/pool1.jsonl /path/to/pool2.jsonl ... + ``` + +6. All teams can now work on their relevance judgments. +7. Export the relevance judgments as [Qrels](https://trec.nist.gov/data/qrels_eng/) from Doccano like so: + + ```shell + teaching-ir export-relevance-judgments project-prefix /path/to/pool1.jsonl /path/to/pool2.jsonl ... + ``` + ## Archived courses The below list includes finished (✅), ongoing (⏳) and future (🔜) IR courses that already benefit from our teaching resources. 
diff --git a/cli/__init__.py b/cli/__init__.py
new file mode 100644
index 0000000..b0ad0d0
--- /dev/null
+++ b/cli/__init__.py
@@ -0,0 +1,3 @@
+from importlib_metadata import version
+
+__version__ = version("teaching-ir-with-shared-tasks")
diff --git a/cli/__main__.py b/cli/__main__.py
new file mode 100644
index 0000000..2c5d13b
--- /dev/null
+++ b/cli/__main__.py
@@ -0,0 +1,335 @@
+from pathlib import Path
+from secrets import choice
+from string import ascii_letters, digits
+from typing import Annotated, Any, Sequence, TypeAlias
+from urllib.parse import urljoin
+from warnings import warn
+
+from annotated_types import Len
+from click import argument, confirm, group, Context, Parameter, echo, option, Path as PathType
+from doccano_client import DoccanoClient
+from pandas import concat, read_json
+from slugify import slugify
+from tqdm import tqdm
+
+from cli import __version__ as app_version
+
+
+def print_version(
+    context: Context,
+    _parameter: Parameter,
+    value: Any,
+) -> None:
+    """Print the application version and exit (eager ``--version`` callback)."""
+    if not value or context.resilient_parsing:
+        return
+    echo(app_version)
+    context.exit()
+
+
+@group()
+@option("-V", "--version", is_flag=True, callback=print_version,
+        expose_value=False, is_eager=True)
+def cli() -> None:
+    """Tools for preparing relevance judgments on Doccano."""
+    pass
+
+
+@cli.command()
+def pool_documents() -> None:
+    """Create pools of retrieved documents (not implemented yet)."""
+    raise NotImplementedError()
+
+
+def _user_name(project_prefix: str, group: str) -> str:
+    """Return the Doccano user name of a group: prefix plus slugified group."""
+    group = slugify(group)
+    return f"{project_prefix}-{group}"
+
+
+_ProjectName: TypeAlias = Annotated[str, Len(min_length=1, max_length=100)]
+
+
+def _project_name(project_prefix: str, query_id: str) -> _ProjectName:
+    """Return the Doccano project name of a query, cut to 100 characters.
+
+    Raises ``ValueError`` for an empty query ID and emits a ``UserWarning``
+    when the name has to be shortened.
+    """
+    if not query_id:
+        raise ValueError("Empty query ID.")
+    name = f"{project_prefix}-{query_id}"
+    if len(name) > 100:
+        warn(UserWarning(
+            f"Project name '{name}' is too long. "
+            f"Shortening to '{name[:100]}'."))
+    return name[:100]
+
+
+_alphabet = ascii_letters + digits
+
+
+def _generate_password(length: int = 10) -> str:
+    """Return a random alphanumeric password of the given length."""
+    return "".join(choice(_alphabet) for _ in range(length))
+
+
+_DEFAULT_PROJECT_DESCRIPTION = """
+Document relevance judgments. (Automatically generated by TIREx.)
+""".strip()
+
+# TODO: Add guidelines.
+_DEFAULT_PROJECT_GUIDELINES = """
+TODO
+(Automatically generated by TIREx.)
+""".strip()
+
+_TAG = "teaching-ir"  # For marking auto-generated projects.
+_LABEL_RELEVANT = "relevant"
+_LABEL_KEY_RELEVANT = "1"
+_LABEL_COLOR_RELEVANT = "#086F02"
+_LABEL_NOT_RELEVANT = "not relevant"
+_LABEL_KEY_NOT_RELEVANT = "2"
+_LABEL_COLOR_NOT_RELEVANT = "#D33115"
+
+
+@cli.command()
+@option(
+    "-d", "--doccano-url",
+    type=str,
+    required=True,
+    prompt="Doccano URL",
+    envvar="DOCCANO_URL",
+)
+@option(
+    "-u", "--doccano-username",
+    type=str,
+    required=True,
+    prompt="Doccano username",
+    envvar="DOCCANO_USERNAME",
+)
+@option(
+    "-p", "--doccano-password",
+    type=str,
+    required=True,
+    prompt="Doccano password",
+    hide_input=True,
+    envvar="DOCCANO_PASSWORD",
+)
+@argument(
+    "project_prefix",
+    type=str,
+)
+@argument(
+    "pooled_documents_paths",
+    type=PathType(
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        readable=True,
+        resolve_path=True,
+        allow_dash=False,
+        path_type=Path,
+    ),
+    nargs=-1,
+)
+def prepare_relevance_judgments(
+    doccano_url: str,
+    doccano_username: str,
+    doccano_password: str,
+    project_prefix: str,
+    pooled_documents_paths: Sequence[Path],
+) -> None:
+    """Create Doccano users, projects, and labels for pooled documents."""
+    # Slugify before validating so that a prefix made up only of characters
+    # that slugify drops is rejected, too.
+    project_prefix = slugify(project_prefix)
+    if not project_prefix:
+        raise ValueError("Empty project prefix.")
+
+    if not pooled_documents_paths:
+        return
+
+    doccano = DoccanoClient(doccano_url)
+    doccano.login(
+        username=doccano_username,
+        password=doccano_password,
+    )
+    echo("Successfully authenticated with Doccano API.")
+
+    pool = concat(
+        read_json(
+            path,
+            lines=True,
+            dtype={
+                "group": str,
+                "query_id": str,
+                "query": str,
+                "description": str,
+                "narrative": str,
+                "doc_id": str,
+                "text": str,
+            }
+        )
+        for path in tqdm(
+            pooled_documents_paths,
+            desc="Read pooled documents",
+            unit="path",
+        )
+    )
+    echo(f"Found {len(pool)} pooled documents.")
+
+    groups: set[str] = set(pool["group"].drop_duplicates().to_list())
+    echo(f"Found {len(groups)} groups.")
+
+    query_ids: set[str] = set(pool["query_id"].drop_duplicates().to_list())
+    echo(f"Found {len(query_ids)} topics.")
+
+    # Create a mapping of expected users.
+    expected_users_dict = {
+        group: _user_name(project_prefix, group)
+        for group in groups
+    }
+    expected_users = set(expected_users_dict.values())
+
+    # Create a mapping of expected projects.
+    expected_projects_dict = {
+        query_id: _project_name(project_prefix, query_id)
+        for query_id in query_ids
+    }
+    expected_projects = set(expected_projects_dict.values())
+
+    # Create missing users.
+    all_users = {
+        user.username: user
+        for user in doccano.search_users()
+    }
+    existing_users = expected_users & all_users.keys()
+    non_existing_users = expected_users - all_users.keys()
+    echo(f"On Doccano, {len(existing_users)} users already exist, "
+         f"{len(non_existing_users)} need to be created.")
+    if non_existing_users:
+        echo("Creating missing users...")
+        for user in non_existing_users:
+            password = _generate_password()
+            # The mapping is keyed by user name, not by user ID.
+            all_users[user] = doccano.create_user(user, password)
+            echo(f"Created user '{user}' with password '{password}'.")
+
+    # Create missing projects.
+    all_projects = {
+        project.name: project
+        for project in doccano.list_projects()
+    }
+    existing_projects = expected_projects & all_projects.keys()
+    non_existing_projects = expected_projects - all_projects.keys()
+    echo(f"On Doccano, {len(existing_projects)} projects already exist, "
+         f"{len(non_existing_projects)} need to be created.")
+    if non_existing_projects:
+        echo("Creating missing projects...")
+        for project in non_existing_projects:
+            # Key by project name: the loops below look projects up by name,
+            # so keying by ID would make new projects unreachable.
+            all_projects[project] = doccano.project.create(
+                name=project,
+                description=_DEFAULT_PROJECT_DESCRIPTION,
+                project_type="DocumentClassification",
+                guideline=_DEFAULT_PROJECT_GUIDELINES,
+                random_order=False,
+                collaborative_annotation=True,
+                single_class_classification=True,
+                tags=[_TAG]
+            )
+            echo(f"Created project '{project}'.")
+
+    unmanaged_projects = {
+        project
+        for project in existing_projects
+        if _TAG not in all_projects[project].tags
+    }
+    if unmanaged_projects:
+        echo("Checking previously unmanaged projects...")
+        for project in unmanaged_projects:
+            project_id = all_projects[project].id
+            project_url = urljoin(
+                doccano_url, f"/projects/{project_id}")
+            if not confirm(
+                    f"The Doccano project '{project}' ({project_url}) "
+                    "does not appear to be generated by this tool. "
+                    "Overwrite project"):
+                raise RuntimeError(
+                    f"Cannot prepare judgments due to clash with unmanaged Doccano project '{project}'.")
+            doccano.project.update(
+                project_id=project_id,
+                tags=list({*all_projects[project].tags, _TAG})
+            )
+
+    # Label text, suffix key, and color of every required label.
+    required_labels = (
+        (_LABEL_RELEVANT, _LABEL_KEY_RELEVANT, _LABEL_COLOR_RELEVANT),
+        (_LABEL_NOT_RELEVANT, _LABEL_KEY_NOT_RELEVANT,
+         _LABEL_COLOR_NOT_RELEVANT),
+    )
+
+    echo("Preparing projects...")
+    for project in expected_projects:
+        project_id = all_projects[project].id
+
+        echo(f"Preparing labels for Doccano project '{project}'.")
+        existing_labels = doccano.list_label_types(
+            project_id=project_id,
+            type="category",
+        )
+        compatible_label_ids = set()
+        for text, suffix_key, color in required_labels:
+            existing_label = next((
+                label for label in existing_labels
+                if label.text == text
+            ), None)
+            if existing_label is not None:
+                # Reuse the label but reset its key and color.
+                doccano.update_label_type(
+                    project_id=project_id,
+                    label_type_id=existing_label.id,
+                    type="category",
+                    text=text,
+                    prefix_key=None,
+                    suffix_key=suffix_key,
+                    color=color,
+                )
+                compatible_label_ids.add(existing_label.id)
+            else:
+                doccano.create_label_type(
+                    project_id=project_id,
+                    type="category",
+                    text=text,
+                    prefix_key=None,
+                    suffix_key=suffix_key,
+                    color=color,
+                )
+        # Any other pre-existing label is considered incompatible.
+        incompatible_labels = [
+            label for label in existing_labels
+            if label.id not in compatible_label_ids
+        ]
+        if incompatible_labels:
+            confirm(
+                f"Found {len(incompatible_labels)} incompatible labels "
+                f"for Doccano project '{project}'. Delete labels", abort=True)
+            doccano.bulk_delete_label_types(
+                project_id=project_id,
+                label_type_ids=[label.id for label in incompatible_labels],
+                type="category",
+            )
+
+        echo(f"Preparing annotators for Doccano project '{project}'.")
+        # TODO: Update project members.
+
+        echo(f"Preparing documents for Doccano project '{project}'.")
+        # TODO: Update documents.
+
+
+if __name__ == "__main__":
+    # pylint: disable=E1120
+    cli()
diff --git a/pyproject.toml b/pyproject.toml
index 2a05993..18ea2c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,16 +1,23 @@
 [project]
 name = "teaching-ir-with-shared-tasks"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 dependencies = [
+    "annotated-types~=0.6.0",
     "click~=8.1",
-    "doccano-client~=1.2.8",
+    # Temporary fix of https://github.com/doccano/doccano-client/issues/158
+    "doccano-client @ git+https://github.com/heinrichreimer/doccano-client.git@c36a6284e941f774ed1eafb198933d51fb05caec",
+    # "doccano-client~=1.2.8",
     "importlib-metadata~=7.0",
     "ir_datasets~=0.5.6",
+    "python-slugify~=8.0",
     "tira~=0.0.125",
     "urllib3~=2.2",
 ]
 version = "0.1.0"
 
+[project.scripts]
+teaching-ir = "cli.__main__:cli"
+
 [tool.setuptools]
 include-package-data = true
 