diff --git a/tests/mapping/test_author_map.py b/tests/mapping/test_author_map.py index 667d66a5d..1a09a7fa6 100644 --- a/tests/mapping/test_author_map.py +++ b/tests/mapping/test_author_map.py @@ -1,19 +1,33 @@ import unittest -import unittest.mock as mock -from pathlib import Path -from tempfile import NamedTemporaryFile -from varats.mapping.author_map import generate_author_map, Author -from varats.project.project_util import get_local_project_git_path +from varats.mapping.author_map import generate_author_map, Author, AuthorMap from varats.projects.discover_projects import initialize_projects +class TestAuthor(unittest.TestCase): + + def test_merge(self) -> None: + author_one = Author(1, "Jon Doe", "jon_doe@jon_doe.com") + author_two = Author(4, "J. Doe", "jon_doe@jon_doe.com") + author_three = Author(2, "Jon Doe", "j_doe@gmail.com") + merge_one = author_one.merge(author_two) + self.assertEqual(merge_one.author_id, 1) + self.assertEqual(merge_one.names, {"Jon Doe", "J. Doe"}) + self.assertEqual(merge_one.mail_addresses, {"jon_doe@jon_doe.com"}) + merge_two = author_three.merge(author_one) + self.assertEqual(merge_two.author_id, 1) + self.assertEqual(merge_two.names, {"Jon Doe", "J. Doe"}) + self.assertEqual( + merge_two.mail_addresses, + {"jon_doe@jon_doe.com", "j_doe@gmail.com"} + ) + + class TestAuthorMap(unittest.TestCase): def test_get_author_by_email(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") test_author = amap.get_author_by_email("jim@meyering.net") self.assertEqual( test_author.mail_addresses, @@ -23,8 +37,7 @@ def test_get_author_by_email(self) -> None: def test_get_author_by_name(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") test_author = amap.get_author_by_name("Jia Cheong Tan") self.assertEqual( test_author.names, {"Jia Cheong Tan", "Jia Tan", "jiat75"} @@ -36,8 +49,7 @@ def test_get_author_by_name(self) -> None: def test_get_author(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") test_author = amap.get_author("Jia Cheong Tan", "jiat75@gmail.com") self.assertEqual( test_author.names, {"Jia Cheong Tan", "Jia Tan", "jiat75"} @@ -49,26 +61,52 @@ def test_get_author(self) -> None: def test_get_author_ambiguous(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") self.assertIsNone(amap.get_author("Jia Cheong Tan", "jim@meyering.net")) def test_get_author_missing_mail(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") self.assertIsNone( amap.get_author("Jia Cheong Tan", "notpresent@None.com") ) def test_get_author_missing_name(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") self.assertIsNone(amap.get_author("Not Present", "jim@meyering.net")) def test_get_author_missing(self) -> None: initialize_projects() - git_path = get_local_project_git_path("xz") - amap = generate_author_map(git_path) + amap = generate_author_map("xz") self.assertIsNone(amap.get_author("Not Present", "notpresent@None.com")) + + def test_author_merging(self) -> None: + amap = AuthorMap() + amap.add_entry("Jon Doe", "jon_doe@jon_doe.com") + amap.add_entry("JD", "jon.d@gmail.com") + amap.add_entry("Jon Doe", "jon.d@gmail.com") + disambiguated_author = amap.get_author("JD", "jon_doe@jon_doe.com") + self.assertEqual(disambiguated_author.author_id, 0) + self.assertEqual(disambiguated_author.names, {"Jon Doe", "JD"}) + self.assertEqual( + disambiguated_author.mail_addresses, + {"jon_doe@jon_doe.com", "jon.d@gmail.com"} + ) + + def test_author_merging_generate(self) -> None: + initialize_projects() + amap = generate_author_map("brotli") + test_author = amap.get_author("eustas", "eustas@google.com") + self.assertEqual( + test_author.names, { + "eustas", "Eugene Kliuchnikov", "Eugene Klyuchnikov", + "Evgenii Kliuchnikov" + } + ) + self.assertEqual( + test_author.mail_addresses, { + "eustas@google.com", "eustas.ru@gmail.com", + "eustas@eustas-wfh.fra.corp.google.com" + } + ) diff --git a/varats-core/varats/mapping/author_map.py b/varats-core/varats/mapping/author_map.py index 4f17411cd..b64af1b76 100644 --- a/varats-core/varats/mapping/author_map.py +++ b/varats-core/varats/mapping/author_map.py @@ -4,22 +4,22 @@ import re import typing as tp from functools import reduce -from pathlib import Path -from benchbuild.utils.cmd import git, mkdir +from benchbuild.utils.cmd import git +from varats.project.project_util import get_local_project_git_path from varats.utils.git_util import __get_git_path_arg LOG = logging.getLogger(__name__) -NAME_REGEX = re.compile("\s*\d+\t(.*) <(.*)>") +NAME_REGEX = re.compile(r"\s*\d+\t(.*) <(.*)>") class Author(): """Representation of one author.""" - def __init__(self, id: int, name: str, email: str) -> None: - self.id = id + def __init__(self, author_id: int, name: str, email: str) -> None: + self.__id = author_id self.name = name self.__names = {name} self.mail = email @@ -27,14 +27,20 @@ def __init__(self, id: int, name: str, email: str) -> None: def __eq__(self, other) -> bool: if isinstance(other, Author): - return self.id == other.id + return self.author_id == other.author_id return False def __str__(self) -> str: - return f"{self.id} {self.name} <{self.mail}>" + return f"{self.name} <{self.mail}>" def __repr__(self) -> str: - return f"{self.id} {self.name} <{self.mail}>; {self.names},{self.mail_addresses}" + return f"{self.name} <{self.mail}>; {self.names},{self.mail_addresses}" + + def __lt__(self, other): + return self.author_id < other.author_id + + def __le__(self, other): + return self.author_id <= other.author_id @property def names(self) -> tp.Set[str]: @@ -44,22 +50,31 @@ def names(self) -> tp.Set[str]: def mail_addresses(self) -> tp.Set[str]: return self.__mail_addresses + @property + def author_id(self) -> int: + return self.__id + def add_data(self, name: str, mail: str) -> None: + """Add additional name and mail to the author.""" if not name in self.names: self.names.add(name) if not mail in self.mail: self.mail_addresses.add(mail) def merge(self, other: 'Author') -> 'Author': - if other.id < self.id: - other.names.union(self.names) - other.mail_addresses.union(self.mail_addresses) + """Merge two authors.""" + if other.author_id < self.author_id: + other.names.update(self.names) + other.mail_addresses.update(self.mail_addresses) return other - self.names.union(other.names) - self.mail_addresses.union(other.mail_addresses) + self.names.update(other.names) + self.mail_addresses.update(other.mail_addresses) return self + def __hash__(self) -> int: + return hash(self.author_id) + class AuthorMap(): """Provides a mapping of an author to all combinations of author name to @@ -70,7 +85,7 @@ def __init__(self) -> None: self.current_id = 0 self.mail_dict: tp.Dict[str, Author] = {} self.name_dict: tp.Dict[str, Author] = {} - self.__authors: tp.List[Author] = [] + self.__authors: tp.Set[Author] = set() def get_author_by_name(self, name: str) -> tp.Optional[Author]: if self._look_up_invalid: @@ -83,7 +98,7 @@ def get_author_by_email(self, email: str) -> tp.Optional[Author]: return self.mail_dict.get(email, None) @property - def authors(self) -> tp.List[Author]: + def authors(self) -> tp.Set[Author]: return self.__authors def get_author(self, name: str, mail: str) -> tp.Optional[Author]: @@ -110,14 +125,14 @@ def new_author_id(self) -> int: def add_entry(self, name: str, mail: str) -> None: """Add authors to the map and invalidate look up dicts.""" - ambiguos_authors = [ + ambiguos_authors = { author for author in self.__authors if name in author.names or mail in author.mail_addresses - ] + } if not ambiguos_authors: - self.__authors.append(Author(self.new_author_id(), name, mail)) - self.look_up_invalid = True + self.__authors.add(Author(self.new_author_id(), name, mail)) + self._look_up_invalid = True return if len(ambiguos_authors) > 1: @@ -125,9 +140,11 @@ def add_entry(self, name: str, mail: str) -> None: lambda accu, author: accu.merge(author), ambiguos_authors ) else: - existing_author = ambiguos_authors[0] + existing_author = ambiguos_authors.pop() existing_author.add_data(name, mail) + self.__authors = self.__authors.difference(ambiguos_authors) + self.__authors.add(existing_author) self._look_up_invalid = True def _gen_lookup_dicts(self) -> None: @@ -144,8 +161,9 @@ def __repr__(self) -> str: return f"{self.name_dict} \n {self.mail_dict}" -def generate_author_map(path: Path) -> AuthorMap: +def generate_author_map(project_name: str) -> AuthorMap: """Generate an AuthorMap for the repository at the given path.""" + path = get_local_project_git_path(project_name) author_map = AuthorMap() test = git[__get_git_path_arg(path), "shortlog", "-sne", "--all"]().strip().split("\n")