Skip to content

Commit

Permalink
Fixes author merging and adds tests for author merging (#802)
Browse files Browse the repository at this point in the history
Fix the merging of authors when a new entry provides a link between existing authors. And add tests for this.
Also change author map generation to take a project name instead of a path, to conform with commit_map generation.
  • Loading branch information
Sinerum authored Jun 27, 2023
1 parent c5cb6c3 commit 514c690
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 40 deletions.
76 changes: 57 additions & 19 deletions tests/mapping/test_author_map.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
import unittest
import unittest.mock as mock
from pathlib import Path
from tempfile import NamedTemporaryFile

from varats.mapping.author_map import generate_author_map, Author
from varats.project.project_util import get_local_project_git_path
from varats.mapping.author_map import generate_author_map, Author, AuthorMap
from varats.projects.discover_projects import initialize_projects


class TestAuthor(unittest.TestCase):

def test_merge(self) -> None:
author_one = Author(1, "Jon Doe", "jon_doe@jon_doe.com")
author_two = Author(4, "J. Doe", "jon_doe@jon_doe.com")
author_three = Author(2, "Jon Doe", "[email protected]")
merge_one = author_one.merge(author_two)
self.assertEqual(merge_one.author_id, 1)
self.assertEqual(merge_one.names, {"Jon Doe", "J. Doe"})
self.assertEqual(merge_one.mail_addresses, {"jon_doe@jon_doe.com"})
merge_two = author_three.merge(author_one)
self.assertEqual(merge_two.author_id, 1)
self.assertEqual(merge_two.names, {"Jon Doe", "J. Doe"})
self.assertEqual(
merge_two.mail_addresses,
{"jon_doe@jon_doe.com", "[email protected]"}
)


class TestAuthorMap(unittest.TestCase):

def test_get_author_by_email(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
test_author = amap.get_author_by_email("[email protected]")
self.assertEqual(
test_author.mail_addresses,
Expand All @@ -23,8 +37,7 @@ def test_get_author_by_email(self) -> None:

def test_get_author_by_name(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
test_author = amap.get_author_by_name("Jia Cheong Tan")
self.assertEqual(
test_author.names, {"Jia Cheong Tan", "Jia Tan", "jiat75"}
Expand All @@ -36,8 +49,7 @@ def test_get_author_by_name(self) -> None:

def test_get_author(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
test_author = amap.get_author("Jia Cheong Tan", "[email protected]")
self.assertEqual(
test_author.names, {"Jia Cheong Tan", "Jia Tan", "jiat75"}
Expand All @@ -49,26 +61,52 @@ def test_get_author(self) -> None:

def test_get_author_ambiguous(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
self.assertIsNone(amap.get_author("Jia Cheong Tan", "[email protected]"))

def test_get_author_missing_mail(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
self.assertIsNone(
amap.get_author("Jia Cheong Tan", "[email protected]")
)

def test_get_author_missing_name(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
self.assertIsNone(amap.get_author("Not Present", "[email protected]"))

def test_get_author_missing(self) -> None:
initialize_projects()
git_path = get_local_project_git_path("xz")
amap = generate_author_map(git_path)
amap = generate_author_map("xz")
self.assertIsNone(amap.get_author("Not Present", "[email protected]"))

def test_author_merging(self) -> None:
amap = AuthorMap()
amap.add_entry("Jon Doe", "jon_doe@jon_doe.com")
amap.add_entry("JD", "[email protected]")
amap.add_entry("Jon Doe", "[email protected]")
disambiguated_author = amap.get_author("JD", "jon_doe@jon_doe.com")
self.assertEqual(disambiguated_author.author_id, 0)
self.assertEqual(disambiguated_author.names, {"Jon Doe", "JD"})
self.assertEqual(
disambiguated_author.mail_addresses,
{"jon_doe@jon_doe.com", "[email protected]"}
)

def test_author_merging_generate(self) -> None:
initialize_projects()
amap = generate_author_map("brotli")
test_author = amap.get_author("eustas", "[email protected]")
self.assertEqual(
test_author.names, {
"eustas", "Eugene Kliuchnikov", "Eugene Klyuchnikov",
"Evgenii Kliuchnikov"
}
)
self.assertEqual(
test_author.mail_addresses, {
"[email protected]", "[email protected]",
"[email protected]"
}
)
60 changes: 39 additions & 21 deletions varats-core/varats/mapping/author_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,43 @@
import re
import typing as tp
from functools import reduce
from pathlib import Path

from benchbuild.utils.cmd import git, mkdir
from benchbuild.utils.cmd import git

from varats.project.project_util import get_local_project_git_path
from varats.utils.git_util import __get_git_path_arg

LOG = logging.getLogger(__name__)

NAME_REGEX = re.compile("\s*\d+\t(.*) <(.*)>")
NAME_REGEX = re.compile(r"\s*\d+\t(.*) <(.*)>")


class Author():
"""Representation of one author."""

def __init__(self, id: int, name: str, email: str) -> None:
self.id = id
def __init__(self, author_id: int, name: str, email: str) -> None:
self.__id = author_id
self.name = name
self.__names = {name}
self.mail = email
self.__mail_addresses = {email}

def __eq__(self, other) -> bool:
if isinstance(other, Author):
return self.id == other.id
return self.author_id == other.author_id
return False

def __str__(self) -> str:
return f"{self.id} {self.name} <{self.mail}>"
return f"{self.name} <{self.mail}>"

def __repr__(self) -> str:
return f"{self.id} {self.name} <{self.mail}>; {self.names},{self.mail_addresses}"
return f"{self.name} <{self.mail}>; {self.names},{self.mail_addresses}"

def __lt__(self, other):
return self.author_id < other.author_id

def __le__(self, other):
return self.author_id <= other.author_id

@property
def names(self) -> tp.Set[str]:
Expand All @@ -44,22 +50,31 @@ def names(self) -> tp.Set[str]:
def mail_addresses(self) -> tp.Set[str]:
return self.__mail_addresses

@property
def author_id(self) -> int:
return self.__id

def add_data(self, name: str, mail: str) -> None:
"""Add additional name and mail to the author."""
if not name in self.names:
self.names.add(name)
if not mail in self.mail:
self.mail_addresses.add(mail)

def merge(self, other: 'Author') -> 'Author':
if other.id < self.id:
other.names.union(self.names)
other.mail_addresses.union(self.mail_addresses)
"""Merge two authors."""
if other.author_id < self.author_id:
other.names.update(self.names)
other.mail_addresses.update(self.mail_addresses)
return other

self.names.union(other.names)
self.mail_addresses.union(other.mail_addresses)
self.names.update(other.names)
self.mail_addresses.update(other.mail_addresses)
return self

def __hash__(self) -> int:
return hash(self.author_id)


class AuthorMap():
"""Provides a mapping of an author to all combinations of author name to
Expand All @@ -70,7 +85,7 @@ def __init__(self) -> None:
self.current_id = 0
self.mail_dict: tp.Dict[str, Author] = {}
self.name_dict: tp.Dict[str, Author] = {}
self.__authors: tp.List[Author] = []
self.__authors: tp.Set[Author] = set()

def get_author_by_name(self, name: str) -> tp.Optional[Author]:
if self._look_up_invalid:
Expand All @@ -83,7 +98,7 @@ def get_author_by_email(self, email: str) -> tp.Optional[Author]:
return self.mail_dict.get(email, None)

@property
def authors(self) -> tp.List[Author]:
def authors(self) -> tp.Set[Author]:
return self.__authors

def get_author(self, name: str, mail: str) -> tp.Optional[Author]:
Expand All @@ -110,24 +125,26 @@ def new_author_id(self) -> int:

def add_entry(self, name: str, mail: str) -> None:
"""Add authors to the map and invalidate look up dicts."""
ambiguos_authors = [
ambiguos_authors = {
author for author in self.__authors
if name in author.names or mail in author.mail_addresses
]
}

if not ambiguos_authors:
self.__authors.append(Author(self.new_author_id(), name, mail))
self.look_up_invalid = True
self.__authors.add(Author(self.new_author_id(), name, mail))
self._look_up_invalid = True
return

if len(ambiguos_authors) > 1:
existing_author = reduce(
lambda accu, author: accu.merge(author), ambiguos_authors
)
else:
existing_author = ambiguos_authors[0]
existing_author = ambiguos_authors.pop()

existing_author.add_data(name, mail)
self.__authors = self.__authors.difference(ambiguos_authors)
self.__authors.add(existing_author)
self._look_up_invalid = True

def _gen_lookup_dicts(self) -> None:
Expand All @@ -144,8 +161,9 @@ def __repr__(self) -> str:
return f"{self.name_dict} \n {self.mail_dict}"


def generate_author_map(path: Path) -> AuthorMap:
def generate_author_map(project_name: str) -> AuthorMap:
"""Generate an AuthorMap for the repository at the given path."""
path = get_local_project_git_path(project_name)
author_map = AuthorMap()
test = git[__get_git_path_arg(path), "shortlog", "-sne",
"--all"]().strip().split("\n")
Expand Down

0 comments on commit 514c690

Please sign in to comment.