diff --git a/CONTRIBUTORS.json b/CONTRIBUTORS.json new file mode 100644 index 000000000..5c50c1db3 --- /dev/null +++ b/CONTRIBUTORS.json @@ -0,0 +1,143 @@ +{ + "forked_at": "2017-11-27T11:53:17-05:00", + "excluded_fork": null, + "pre_fork_contributors_by_name": null, + "contributors_by_name": { + "Alex Balashov": { + "emails": [ + "alexander_balashov@hms.harvard.edu" + ], + "names": [ + "Alex Balashov", + "Alexander Balashov" + ] + }, + "Alexander Veit": { + "emails": [ + "53857412+alexander-veit@users.noreply.github.com" + ], + "names": [ + "Alexander Veit" + ] + }, + "Andrea Cosolo": { + "emails": [ + "58441550+andreacosolo@users.noreply.github.com" + ], + "names": [ + "Andrea Cosolo" + ] + }, + "Andy Schroeder": { + "emails": [ + "andrew_schroeder@hms.harvard.edu" + ], + "names": [ + "Andy Schroeder", + "aschroed" + ] + }, + "Carl Vitzthum": { + "emails": [ + "carl.vitzthum@gmail.com", + "carl_vitzthum@hms.harvard.edu" + ], + "names": [ + "Carl Vitzthum" + ] + }, + "David Michaels": { + "emails": [ + "david_michaels@hms.harvard.edu", + "dmichaels@gmail.com", + "105234079+dmichaels-harvard@users.noreply.github.com" + ], + "names": [ + "David Michaels", + "dmichaels-harvard" + ] + }, + "Douglas Rioux": { + "emails": [ + "douglas_rioux@hms.harvard.edu", + "58236592+drio18@users.noreply.github.com" + ], + "names": [ + "Douglas Rioux", + "drio18" + ] + }, + "Eric Berg": { + "emails": [ + "Eric_Berg@hms.harvard.edu" + ], + "names": [ + "Eric Berg" + ] + }, + "Jeremy Johnson": { + "emails": [ + "jeremy@softworks.com.my" + ], + "names": [ + "Jeremy Johnson", + "j1z0" + ] + }, + "Kent M Pitman": { + "emails": [ + "netsettler@users.noreply.github.com", + "kent_pitman@hms.harvard.edu" + ], + "names": [ + "Kent M Pitman", + "Kent Pitman" + ] + }, + "Koray K\u0131rl\u0131": { + "emails": [ + "KorayKirli@users.noreply.github.com", + "koray_kirli@hms.harvard.edu" + ], + "names": [ + "Koray K\u0131rl\u0131" + ] + }, + "Luisa Mercado": { + "emails": [ + "42242797+Lmercadom@users.noreply.github.com" + ], + "names": [ + "Lmercadom", + "Luisa Mercado" + ] + }, + "Sarah Reiff": { + "emails": [ + "sarah_reiff@hms.harvard.edu" + ], + "names": [ + "Sarah", + "Sarah Reiff" + ] + }, + "Soo Lee": { + "emails": [ + "duplexa@gmail.com" + ], + "names": [ + "Soo Lee", + "SooLee" + ] + }, + "Will Ronchetti": { + "emails": [ + "wrr33@cornell.edu" + ], + "names": [ + "Will Ronchetti", + "William Ronchetti" + ] + } + } +} diff --git a/dcicutils/contribution_scripts.py b/dcicutils/contribution_scripts.py new file mode 100644 index 000000000..99d818a8d --- /dev/null +++ b/dcicutils/contribution_scripts.py @@ -0,0 +1,39 @@ +import argparse + +from dcicutils.command_utils import script_catch_errors, ScriptFailure +from .contribution_utils import Contributions, PROJECT_HOME + + +EPILOG = __doc__ + + +def show_contributors(repo, exclude_fork=None, verbose=False, save_contributors=False, test=False): + contributions = Contributions(repo=repo, exclude_fork=exclude_fork, verbose=verbose) + if save_contributors: + contributions.save_contributor_data() + contributions.show_repo_contributors(error_class=ScriptFailure if test else None) + + +def show_contributors_main(*, simulated_args=None): + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. + description=(f"Show authors of a specified repository, which will be presumed" + f" to have been cloned as a subdirectory of $PROJECT_HOME ({PROJECT_HOME})"), + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('repo', default=None, + help="name of repository to show contributors for") + parser.add_argument('--exclude', '-x', default=None, + help="name of repository that repo was forked from, whose contributors to exclude") + parser.add_argument('--save-contributors', '-s', action="store_true", default=False, + help="whether to store contributor data to CONTRIBUTORS.json") + parser.add_argument('--test', '-t', action="store_true", default=False, + help="whether to treat this as a test, erring if a cache update is needed") + parser.add_argument('--verbose', '-v', action="store_true", default=False, + help="whether to do verbose output while working") + args = parser.parse_args(args=simulated_args) + + with script_catch_errors(): + + show_contributors(repo=args.repo, exclude_fork=args.exclude, verbose=args.verbose, + save_contributors=args.save_contributors, test=args.test) diff --git a/dcicutils/contribution_utils.py b/dcicutils/contribution_utils.py new file mode 100644 index 000000000..41dd84a9f --- /dev/null +++ b/dcicutils/contribution_utils.py @@ -0,0 +1,532 @@ +import datetime +import git +import io +import json +import os +import re + +from collections import defaultdict +from dcicutils.diff_utils import DiffManager +from dcicutils.lang_utils import n_of +from dcicutils.misc_utils import PRINT, ignored, environ_bool +from typing import Dict, List, Optional, Set, Type + + +DEBUG_CONTRIBUTIONS = environ_bool("DEBUG_CONTRIBUTIONS") + +GITHUB_USER_REGEXP = re.compile('(?:[0-9]+[+])?(.*)[@]users[.]noreply[.]github[.]com') +PROJECT_HOME = os.environ.get('PROJECT_HOME', os.path.dirname(os.path.abspath(os.curdir))) + + +class GitAnalysis: + + @classmethod + def find_repo(cls, repo_name: str) -> git.Repo: + repo_path = os.path.join(PROJECT_HOME, repo_name) + repo = git.Repo(repo_path) + return repo + + @classmethod + def git_commits(cls, repo_name) -> List[git.Commit]: + repo = cls.find_repo(repo_name) + commit: git.Commit + for commit in repo.iter_commits(): + yield cls.json_for_commit(commit) + + @classmethod + def json_for_actor(cls, actor: git.Actor) -> Dict: + return { + "name": actor.name, + "email": actor.email, + } + + @classmethod + def json_for_commit(cls, commit: git.Commit) -> Dict: + return { + 'commit': commit.hexsha, + 'date': commit.committed_datetime.isoformat(), + 'author': cls.json_for_actor(commit.author), + 'coauthors': [cls.json_for_actor(co_author) for co_author in commit.co_authors], + 'message': commit.message, + } + + +class Contributor: + + @classmethod + def create(cls, *, author: git.Actor) -> 'Contributor': + return Contributor(email=author.email, name=author.name) + + def __init__(self, *, email: Optional[str] = None, name: Optional[str] = None, + emails: Optional[Set[str]] = None, names: Optional[Set[str]] = None, + primary_name: Optional[str] = None): + # Both email and name are required keyword arguments, though name is allowed to be None, + # even though email is not. The primary_name is not required, and defaults to None, so will + # be heuristically computed based on available names. + if not email and not emails: + raise ValueError("One of email= or emails= is required.") + if email and emails: + raise ValueError("Only one of email= and emails= may be provided.") + if not emails: + emails = {email} + if name and names: + raise ValueError("Only one of name= and names= may be provided.") + if name and not names: + names = {name} + self.emails: Set[str] = emails + self.names: Set[str] = names or set() + self._primary_name = primary_name + if primary_name: + self.names.add(primary_name) + + def __str__(self): + maybe_primary = f" {self._primary_name!r}" if self._primary_name else "" + emails = ",".join(sorted(map(repr, self.emails), key=lambda x: x.lower())) + names = ",".join(sorted(map(repr, self.names), key=lambda x: x.lower())) + return f"<{self.__class__.__name__}{maybe_primary} emails={emails} names={names} {id(self)}>" + + def copy(self): + return Contributor(emails=self.emails.copy(), names=self.names.copy(), primary_name=self._primary_name) + + @property + def primary_name(self): + if self._primary_name: + return self._primary_name + return sorted(self.names, key=self.name_variation_spelling_sort_key, reverse=True)[0] + + def set_primary_name(self, primary_name: str): + self.names.add(primary_name) + self._primary_name = primary_name + + def notice_mention_as(self, *, email: Optional[str] = None, name: Optional[str] = None): + if email is not None and email not in self.emails: + self.emails.add(email) + if name is not None and name not in self.names: + self.names.add(name) + + def as_dict(self): + # Note that the sort is case-sensitive alphabetic just because that's easiest here. + # Making it be case-insensitive would require a special case for non-strings, + # and all we really care about is some easy degree of determinism for testing. + data = { + "emails": sorted(self.emails), + "names": sorted(self.names), + } + return data + + @classmethod + def from_dict(cls, data: Dict, *, key: Optional[str] = None) -> 'Contributor': + emails = data["emails"] + names = data["names"] + contributor = Contributor(email=emails[0], name=names[0]) + contributor.emails = set(emails) + contributor.names = set(names) + if key: + contributor.set_primary_name(key) + return contributor + + @classmethod + def name_variation_spelling_sort_key(cls, name): + return ( + ' ' in name, # we prefer names like 'jane doe' over jdoe123 + len(name), # longer names are usually more formal; william smith vs will smith + name, # this names by alphabetical order not because one is better, but to make sort results deterministic + ) + + +ContributorIndex = Optional[Dict[str, Contributor]] + + +class BasicContributions(GitAnalysis): + + VERBOSE = False + + def __init__(self, *, repo: Optional[str] = None, + exclude_fork: Optional[str] = None, + verbose: Optional[bool] = None): + self.email_timestamps: Dict[str, datetime.datetime] = {} + self.name_timestamps: Dict[str, datetime.datetime] = {} + self.exclude_fork: Optional[str] = exclude_fork + if not repo: + # Doing it this way gets around an ambiguity about '/foo/' vs '/foo' since both + # os.path.join('/foo/', 'bar') and os.path.join('/foo', 'bar') yield '/foo/bar', + # and from there one can do os.path.basename(os.path.dirname(...)) to get 'foo' out. + cache_file = os.path.join(os.path.abspath(os.path.curdir), self.CONTRIBUTORS_CACHE_FILE) + dir = os.path.dirname(cache_file) + repo = os.path.basename(dir) + self.repo: str = repo + self.excluded_contributions = None + self.forked_at: Optional[datetime.datetime] = None + self.contributors_by_name: Optional[ContributorIndex] = None + self.contributors_by_email: Optional[ContributorIndex] = None + self.pre_fork_contributors_by_email: Optional[ContributorIndex] = None + self.pre_fork_contributors_by_name: Optional[ContributorIndex] = None + self.loaded_contributor_data = None + self.cache_discrepancies: Optional[dict] = None + self.verbose = self.VERBOSE if verbose is None else verbose + + CONTRIBUTORS_CACHE_FILE = 'CONTRIBUTORS.json' + + def contributors_json_file(self): + """ + Returns the name of the CONTRIBUTORS.json file for the repo associated with this class. + """ + return os.path.join(PROJECT_HOME, self.repo, self.CONTRIBUTORS_CACHE_FILE) + + def existing_contributors_json_file(self): + """ + Returns the name of the CONTRIBUTORS.json file for the repo associated with this class if that file exists, + or None if there is no such file. + """ + file = self.contributors_json_file() + if os.path.exists(file): + return file + else: + return None + + @classmethod + def notice_reference_time(cls, key: str, timestamp: datetime.datetime, timestamps: Dict[str, datetime.datetime]): + reference_timestamp: datetime.datetime = timestamps.get(key) + if not reference_timestamp: + timestamps[key] = timestamp + elif timestamp > reference_timestamp: + timestamps[key] = timestamp + + def email_reference_time(self, email): + return self.email_timestamps.get(email) + + def name_reference_time(self, name): + return self.name_timestamps.get(name) + + @classmethod + def contributor_values_as_dicts(cls, contributor_index: Optional[ContributorIndex]): + if contributor_index is None: + return None + else: + return { + key: contributor.as_dict() + for key, contributor in contributor_index.items() + } + + @classmethod + def contributor_values_as_objects(cls, contributor_index: Optional[Dict]): + if contributor_index is None: + return None + else: + return { + key: Contributor.from_dict(value, key=key) + for key, value in contributor_index.items() + } + + def checkpoint_state(self): + return self.as_dict() + + def as_dict(self): + data = { + "forked_at": self.forked_at.isoformat() if self.forked_at else None, + "excluded_fork": self.exclude_fork, + "pre_fork_contributors_by_name": self.contributor_values_as_dicts(self.pre_fork_contributors_by_name), + "contributors_by_name": self.contributor_values_as_dicts(self.contributors_by_name), + } + return data + + def save_contributor_data(self, filename: Optional[str] = None) -> str: + if filename is None: + filename = self.contributors_json_file() + with io.open(filename, 'w') as fp: + PRINT(json.dumps(self.as_dict(), indent=2), file=fp) + return filename + + def repo_contributor_names(self, with_email=False): + for name, contributor in self.contributors_by_name.items(): + if with_email: + yield f"{name} ({', '.join([self.pretty_email(email) for email in contributor.emails])})" + else: + yield name + + @classmethod + def pretty_email(cls, email): + m = GITHUB_USER_REGEXP.match(email) + if m: + user_name = m.group(1) + return f"{user_name}@github" + else: + return email + + @classmethod + def get_contributors_json_from_file_cache(cls, filename): + try: + with io.open(filename, 'r') as fp: + data = json.load(fp) + except Exception: + PRINT(f"Error while reading data from {filename!r}.") + raise + return data + + +class Contributions(BasicContributions): + + def __init__(self, *, repo: Optional[str] = None, + exclude_fork: Optional[str] = None, + verbose: Optional[bool] = None): + super().__init__(repo=repo, exclude_fork=exclude_fork, verbose=verbose) + existing_contributor_data_file = self.existing_contributors_json_file() + if existing_contributor_data_file: + # This will set .loaded_contributor_data and other values from CONTRIBUTORS.json + self.load_contributors_from_json_file_cache(existing_contributor_data_file) + + if exclude_fork and not self.excluded_contributions: + self.excluded_contributions = Contributions(repo=exclude_fork) + checkpoint1 = self.checkpoint_state() + self.reconcile_contributors_with_github_log() + checkpoint2 = self.checkpoint_state() + + def list_to_dict_normalizer(*, label, item): + ignored(label) + if isinstance(item, list): + return {elem: elem for elem in item} + else: + return item + + if existing_contributor_data_file: + diff_manager = DiffManager(label="contributors") + contributors1 = checkpoint1['contributors_by_name'] + contributors2 = checkpoint2['contributors_by_name'] + diffs = diff_manager.diffs(contributors1, contributors2, normalizer=list_to_dict_normalizer) + added = diffs.get('added') + changed = diffs.get('changed') + removed = diffs.get('removed') + self.cache_discrepancies = cache_discrepancies = {} + if added: + cache_discrepancies['to_add'] = added + if changed: + cache_discrepancies['to_change'] = changed + if removed: + cache_discrepancies['to_remove'] = removed + + def load_contributors_from_json_file_cache(self, filename): + self.loaded_contributor_data = data = self.get_contributors_json_from_file_cache(filename) + self.load_from_dict(data) + + if DEBUG_CONTRIBUTIONS: # pragma: no cover - debugging only + PRINT("After load_contributors_from_json_file_cache...") + PRINT(f"{n_of(self.pre_fork_contributors_by_name, 'pre-fork contributor by name')}") + PRINT(f"{n_of(self.pre_fork_contributors_by_email, 'pre-fork contributor by email')}") + PRINT(f"{n_of(self.contributors_by_name, 'contributor by name')}") + PRINT(f"{n_of(self.contributors_by_email, 'contributor by email')}") + + def reconcile_contributors_with_github_log(self): + """ + Rummages the GitHub log entries for contributors we don't know about. + That data is merged against our existing structures. + """ + if DEBUG_CONTRIBUTIONS: # pragma: no cover - debugging only + PRINT("Reconciling with git log.") + excluded_fork_contributors_by_email = (self.excluded_contributions.contributors_by_email + if self.excluded_contributions + else {}) + # excluded_fork_contributors_by_name = (self.excluded_contributions.contributors_by_name + # if self.excluded_contributions + # else {}) + fork_contributor_emails = set(excluded_fork_contributors_by_email.keys()) + + post_fork_contributors_seen = defaultdict(lambda: []) + + contributors_by_email: ContributorIndex = self.contributors_by_email or {} + git_repo = self.find_repo(repo_name=self.repo) + + def notice_author(*, author: git.Actor, date: datetime.datetime): + if self.forked_at: + if date < self.forked_at: + return + # raise Exception("Commits are out of order.") + elif author.email not in fork_contributor_emails: + PRINT(f"Forked {self.repo} at {date} by {author.email}") + self.forked_at = date + + if self.forked_at and date >= self.forked_at: + if author.email in fork_contributor_emails: + # PRINT(f"Post-fork contribution from {author.email} ({date})") + post_fork_contributors_seen[author.email].append(date) + self.notice_reference_time(key=author.email, timestamp=date, timestamps=self.email_timestamps) + self.notice_reference_time(key=author.name, timestamp=date, timestamps=self.name_timestamps) + + contributor_by_email = contributors_by_email.get(author.email) + if contributor_by_email: # already exists, so update it + contributor_by_email.notice_mention_as(email=author.email, name=author.name) + else: # need to create it new + contributor_by_email = Contributor.create(author=author) + contributors_by_email[author.email] = contributor_by_email + else: + # print("skipped") + pass + + n = 0 + for commit in reversed(list(git_repo.iter_commits())): + n += 1 + commit_date = commit.committed_datetime + notice_author(author=commit.author, date=commit_date) + for co_author in commit.co_authors: + notice_author(author=co_author, date=commit_date) + if DEBUG_CONTRIBUTIONS: + PRINT(f"{n_of(n, 'commit')} processed.") + + for email, dates in post_fork_contributors_seen.items(): + when = str(dates[0].date()) + if len(dates) > 1: + when += f" to {dates[-1].date()}" + PRINT(f"{n_of(dates, 'post-fork commit')} seen for {email} ({when}).") + + contributors_by_name: ContributorIndex = {} + + for contributor_by_email in contributors_by_email.values(): + self.traverse(root=contributor_by_email, + cursor=contributor_by_email, + contributors_by_email=contributors_by_email, + contributors_by_name=contributors_by_name) + for name in list(contributor_by_email.names): + contributors_by_name[name] = contributor_by_email + for email in list(contributor_by_email.emails): + contributors_by_email[email] = contributor_by_email + + # Note that the name table is somewhat unified, merging related cells, but the email table isn't. + # In part that's because I was lazy, but also we don't really need the email table to be so careful. + # It's the human names that really matter. + # if not self.pre_fork_contributors_by_name: + # self.pre_fork_contributors_by_name = excluded_fork_contributors_by_name + + # if not self.pre_fork_contributors_by_email: + # self.pre_fork_contributors_by_email = excluded_fork_contributors_by_email + + self.contributors_by_name = self.contributor_index_by_primary_name(contributors_by_name) + self.contributors_by_email = contributors_by_email + + if DEBUG_CONTRIBUTIONS: # pragma: no cover - debugging only + PRINT("After reconcile_contributors_with_github_log...") + PRINT(f"{n_of(self.pre_fork_contributors_by_name, 'pre-fork contributor by name')}") + PRINT(f"{n_of(self.pre_fork_contributors_by_email, 'pre-fork contributor by email')}") + PRINT(f"{n_of(self.contributors_by_name, 'contributor by name')}") + PRINT(f"{n_of(self.contributors_by_email, 'contributor by email')}") + + @classmethod + def contributor_index_by_primary_name(cls, contributors_by_name: ContributorIndex) -> ContributorIndex: + """ + Given a by-name contributor index: + + * Makes sure that all contributors have only one name, indexed by the contributor's primary name + * Sorts the resulting index using a case-insensitive alphabetic sort + + and then returns the result. + + :param contributors_by_name: a contributor index indexed by human name + :return: a contributor index + """ + seen = set() + nicknames_seen = set() + contributor_items = [] + contributors = {} + for name, contributor in contributors_by_name.items(): + if contributor not in seen: + for nickname in contributor.names: + if nickname in nicknames_seen: + raise Exception(f"Name improperly shared between {contributor}" + f" and {contributors_by_name[nickname]}") + nicknames_seen.add(nickname) + contributor_items.append((contributor.primary_name, contributor)) + seen.add(contributor) + for name, contributor in sorted(contributor_items, + # Having selected the longest names, now sort names ignoring case + key=lambda pair: pair[0].lower()): + contributors[name] = contributor + return contributors + + @classmethod + def traverse(cls, + root: Contributor, + cursor: Optional[Contributor], + contributors_by_email: ContributorIndex, + contributors_by_name: ContributorIndex, + seen: Optional[Set[Contributor]] = None): + if seen is None: + seen = set() + if cursor in seen: # It's slightly possible that a person has a name of None that slipped in. Ignore that. + return + seen.add(cursor) + for name in list(cursor.names): + root.names.add(name) + for email in list(cursor.emails): + root.emails.add(email) + for name in list(cursor.names): + contributor = contributors_by_name.get(name) + if contributor and contributor not in seen: + cls.traverse(root=root, cursor=contributor, contributors_by_email=contributors_by_email, + contributors_by_name=contributors_by_name, seen=seen) + for email in list(cursor.emails): + contributor = contributors_by_email.get(email) + if contributor and contributor not in seen: + cls.traverse(root=root, cursor=contributor, contributors_by_email=contributors_by_email, + contributors_by_name=contributors_by_name, seen=seen) + + def load_from_dict(self, data: Dict): + forked_at: Optional[str] = data.get('forked_at') + excluded_fork = data.get('excluded_fork') + self.forked_at: Optional[datetime.datetime] = (None + if forked_at is None + else datetime.datetime.fromisoformat(forked_at)) + self.exclude_fork = excluded_fork + + fork_contributors_by_name_json = data.get('pre_fork_contributors_by_name') or {} + fork_contributors_by_name = self.contributor_values_as_objects(fork_contributors_by_name_json) + self.pre_fork_contributors_by_name = fork_contributors_by_name + fork_contributors_by_email_json = data.get('pre_fork_contributors_by_email') or {} + if fork_contributors_by_email_json: + self.pre_fork_contributors_by_email = self.contributor_values_as_objects(fork_contributors_by_email_json) + else: + self.pre_fork_contributors_by_email = self.by_email_from_by_name(fork_contributors_by_name) + + contributors_by_name_json = data.get('contributors_by_name', {}) + self.contributors_by_name = contributors_by_name = self.contributor_values_as_objects(contributors_by_name_json) + contributors_by_email_json = data.get('contributors_by_email', {}) + if contributors_by_email_json: + self.contributors_by_email = self.contributor_values_as_objects(contributors_by_email_json) + else: + self.contributors_by_email = self.by_email_from_by_name(contributors_by_name) + + @classmethod + def by_email_from_by_name(cls, contributors_by_email_json): + result = {} + seen = set() + for email_key, entry in contributors_by_email_json.items(): + ignored(email_key) + seen_key = id(entry) + if seen_key in seen: + continue + seen.add(seen_key) + for email in entry.get("emails", []) if isinstance(entry, dict) else entry.emails: + if result.get(email): + raise Exception(f"email address {email} is used more than once.") + result[email] = entry + return result + + def show_repo_contributors(self, analyze_discrepancies: bool = True, with_email: bool = True, + error_class: Optional[Type[BaseException]] = None): + for author_name in self.repo_contributor_names(with_email=with_email): + PRINT(author_name) + if analyze_discrepancies: + file = self.existing_contributors_json_file() + if not file: + message = f"Need to create a {self.CONTRIBUTORS_CACHE_FILE} file for {self.repo}." + if error_class: + raise error_class(message) + else: + PRINT(message) + elif self.cache_discrepancies: + message = "There are contributor cache discrepancies." + PRINT(f"===== {message.rstrip('.').upper()} =====") + for action, items in self.cache_discrepancies.items(): + action: str + PRINT(f"{action.replace('_', ' ').title()}:") + for item in items: + PRINT(f" * {item}") + if error_class: + raise error_class(message) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index b2124a43d..698379470 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -2,10 +2,11 @@ import datetime import io import json -import logging +# import logging import os import re import subprocess +import warnings try: import piplicenses @@ -30,8 +31,8 @@ from dcicutils.misc_utils import PRINT, get_error_message, local_attrs -logging.basicConfig() -logger = logging.getLogger(__name__) +# logging.basicConfig() +# logger = logging.getLogger(__name__) _FRAMEWORK = 'framework' _LANGUAGE = 'language' @@ -270,7 +271,7 @@ def report(message): if analysis: analysis.miscellaneous.append(message) else: - logger.warning(message) + warnings.warn(message) parsed = cls.parse_simple_license_file(filename=filename) if check_license_title: license_title = parsed['license-title'] @@ -488,11 +489,12 @@ class MyOrgLicenseChecker(LicenseChecker): cls.analyze_license_file(analysis=analysis) cls.show_unacceptable_licenses(analysis=analysis) if analysis.unexpected_missing: - logger.warning(there_are(analysis.unexpected_missing, kind='unexpectedly missing license', punctuate=True)) + warnings.warn(there_are(analysis.unexpected_missing, kind='unexpectedly missing license', punctuate=True)) if analysis.no_longer_missing: - logger.warning(there_are(analysis.no_longer_missing, kind='no-longer-missing license', punctuate=True)) + # This is not so major as to need a warning, but it's still something that should show up somewhere. + PRINT(there_are(analysis.no_longer_missing, kind='no-longer-missing license', punctuate=True)) for message in analysis.miscellaneous: - logger.warning(message) + warnings.warn(message) if analysis.unacceptable: raise LicenseAcceptabilityCheckFailure(unacceptable_licenses=analysis.unacceptable) @@ -702,6 +704,11 @@ class C4InfrastructureLicenseChecker(LicenseChecker): # Ref: https://github.com/getsentry/responses/blob/master/LICENSE 'responses', + # This seems to get flagged sometimes, but is not the pypi snovault library, it's what our dcicsnovault + # calls itself internally.. In any case, it's under MIT license and OK. + # Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt + 'snovault', + # PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" # Ref: https://pypi.org/project/supervisor/ # In fact, though, the license is a bit more complicated, though apparently still permissive. @@ -710,7 +717,7 @@ class C4InfrastructureLicenseChecker(LicenseChecker): # This seems to be a BSD-3-Clause-Modification license. # Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt - 'translationstring', + # 'translationstring', # This seems to be a BSD-3-Clause-Modification license. # Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt diff --git a/dcicutils/qa_checkers.py b/dcicutils/qa_checkers.py index 61a79cc77..291135bd1 100644 --- a/dcicutils/qa_checkers.py +++ b/dcicutils/qa_checkers.py @@ -8,6 +8,7 @@ from collections import defaultdict from typing import Optional, List, Dict, Type from typing_extensions import Literal +from .contribution_utils import Contributions from .lang_utils import conjoined_list, n_of, there_are from .misc_utils import PRINT, remove_prefix, remove_suffix, getattr_customized, CustomizableProperty @@ -436,3 +437,11 @@ def summarize(problems): detail += f"\n In {file}, {summarize(matches)}." message = f"{n_of(n, 'problem')} detected:" + detail raise AssertionError(message) + + +class ContributionsChecker: + + @classmethod + def validate(cls): + contributions = Contributions() # no repo specified, so use current directory + contributions.show_repo_contributors(error_class=AssertionError) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index b18d0a952..7fdaba7ea 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -37,6 +37,20 @@ command_utils :members: +contribution_scripts +^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: dcicutils.contribution_scripts + :members: + + +contribution_utils +^^^^^^^^^^^^^^^^^^ + +.. automodule:: dcicutils.contribution_utils + :members: + + creds_utils ^^^^^^^^^^^ diff --git a/poetry.lock b/poetry.lock index da9155f6b..d428e79a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -70,18 +70,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.26.101" +version = "1.28.11" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.26.101-py3-none-any.whl", hash = "sha256:5f5279a63b359ba8889e9a81b319e745b14216608ffb5a39fcbf269d1af1ea83"}, - {file = "boto3-1.26.101.tar.gz", hash = "sha256:670ae4d1875a2162e11c6e941888817c3e9cf1bb9a3335b3588d805b7d24da31"}, + {file = "boto3-1.28.11-py3-none-any.whl", hash = "sha256:e24460d50001b517c6734dcf1c879feb43aa2062d88d9bdbb8703c986cb05941"}, + {file = "boto3-1.28.11.tar.gz", hash = "sha256:0fe7a35cf0041145c8eefebd3ae2ddf41baed62d7c963e5042b8ed8c297f648f"}, ] [package.dependencies] -botocore = ">=1.29.101,<1.30.0" +botocore = ">=1.31.11,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -386,14 +386,14 @@ xray = ["mypy-boto3-xray (>=1.18.18)"] [[package]] name = "botocore" -version = "1.29.101" +version = "1.31.11" description = "Low-level, data-driven core of boto 3." category = "main" optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.29.101-py3-none-any.whl", hash = "sha256:60c7a7bf8e2a288735e507007a6769be03dc24815f7dc5c7b59b12743f4a31cf"}, - {file = "botocore-1.29.101.tar.gz", hash = "sha256:7bb60d9d4c49500df55dfb6005c16002703333ff5f69dada565167c8d493dfd5"}, + {file = "botocore-1.31.11-py3-none-any.whl", hash = "sha256:d3cbffe554c9a1ba2ac6973734c43c21b8e7985a2ac4a4c31a09811b8029445c"}, + {file = "botocore-1.31.11.tar.gz", hash = "sha256:b17ff973bb70b02b227928c2abe4992f1cfc46d13aee0228516c8f32572b88c6"}, ] [package.dependencies] @@ -402,7 +402,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.16.9)"] +crt = ["awscrt (==0.16.26)"] [[package]] name = "botocore-stubs" @@ -820,14 +820,14 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.31" +version = "3.1.32" description = "GitPython is a Python library used to interact with Git repositories" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "GitPython-3.1.31-py3-none-any.whl", hash = "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"}, - {file = "GitPython-3.1.31.tar.gz", hash = "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573"}, + {file = "GitPython-3.1.32-py3-none-any.whl", hash = "sha256:e3d59b1c2c6ebb9dfa7a184daf3b6dd4914237e7488a1730a6d8f6f5d0b4187f"}, + {file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"}, ] [package.dependencies] diff --git a/pyproject.toml b/pyproject.toml index d79690d4f..2715c6ac4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.6.0.2b3" +version = "7.6.0.2b7" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -80,15 +80,18 @@ pytest-runner = ">=5.1" [tool.poetry.scripts] publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" - +show-contributors = "dcicutils.contribution_scripts:show_contributors_main" [tool.pytest.ini_options] addopts = "--basetemp=/tmp/pytest" redis_exec = "/usr/local/bin/redis-server" filterwarnings = [ + # TODO: These next two are about use of the Version class: "ignore:distutils Version classes are deprecated. Use packaging.version instead.:DeprecationWarning", + "ignore:Setuptools is replacing distutils.:UserWarning", # This is a pip-licenses problem (still present in 3.5.5): "ignore:pkg_resources is deprecated as an API:DeprecationWarning", + "ignore:Boto3 will no longer support Python 3.7 starting December 13, 2023.*:", ] markers = [ "working: test should work", diff --git a/test/test_contribution_scripts.py b/test/test_contribution_scripts.py new file mode 100644 index 000000000..8fa040c1e --- /dev/null +++ b/test/test_contribution_scripts.py @@ -0,0 +1,63 @@ +import pytest + +from dcicutils import contribution_scripts as contribution_scripts_module +from dcicutils.command_utils import ScriptFailure +from dcicutils.contribution_scripts import show_contributors, show_contributors_main +from unittest import mock + + +class MockContributions: + + def __init__(self, repo, exclude_fork=None, verbose=False): + self.repo = repo + self.exclude_fork = exclude_fork + self.verbose = verbose + + +def test_show_contributors(): + with mock.patch.object(contribution_scripts_module, "Contributions") as mock_contributions: + contributions_object = mock.MagicMock() + mock_contributions.return_value = contributions_object + + show_contributors(repo='my-repo') + + assert contributions_object.save_contributor_data.call_count == 0 + + mock_contributions.assert_called_once_with(repo='my-repo', exclude_fork=None, verbose=False) + contributions_object.show_repo_contributors.assert_called_once_with(error_class=None) + + mock_contributions.reset_mock() + contributions_object = mock.MagicMock() + mock_contributions.return_value = contributions_object + + show_contributors(repo='another-repo', exclude_fork='whatever', save_contributors=True, verbose=True, test=True) + + mock_contributions.assert_called_once_with(repo='another-repo', exclude_fork='whatever', verbose=True) + contributions_object.show_repo_contributors.assert_called_once_with(error_class=ScriptFailure) + contributions_object.save_contributor_data.assert_called_once_with() + + +def test_show_contributors_main(): + + with mock.patch.object(contribution_scripts_module, "show_contributors") as mock_show_contributors: + + with pytest.raises(SystemExit) as exc: + + show_contributors_main(simulated_args=['some-repo']) + + assert exc.value.code == 0 + + mock_show_contributors.assert_called_once_with(repo='some-repo', exclude_fork=None, verbose=False, + save_contributors=False, test=False) + + mock_show_contributors.reset_mock() + + with pytest.raises(SystemExit) as exc: + + show_contributors_main(simulated_args=['my-repo', '--exclude', 'their-repo', + '--save-contributors', '--test', '--verbose']) + + assert exc.value.code == 0 + + mock_show_contributors.assert_called_once_with(repo='my-repo', exclude_fork='their-repo', + save_contributors=True, test=True, verbose=True) diff --git a/test/test_contribution_utils.py b/test/test_contribution_utils.py new file mode 100644 index 000000000..fee2def01 --- /dev/null +++ b/test/test_contribution_utils.py @@ -0,0 +1,630 @@ +import contextlib +import datetime +import git +import io +import json +import os +import pytest +import re + +from dcicutils import contribution_utils as contribution_utils_module +from dcicutils.contribution_utils import Contributor, BasicContributions, Contributions, GitAnalysis +from dcicutils.misc_utils import make_counter, file_contents # , override_environ +from dcicutils.qa_utils import MockId, MockFileSystem, printed_output +from typing import List, Optional +from unittest import mock + + +class MockGitActor: + def __init__(self, name: Optional[str], email: str): + self.name = name + self.email = email + + +class MockGitCommit: + def __init__(self, hexsha: str, committed_datetime: str, author: dict, + message: str, co_authors: Optional[List[dict]] = None): + self.hexsha = hexsha + self.committed_datetime = datetime.datetime.fromisoformat(committed_datetime) + self.author = MockGitActor(**author) + self.co_authors = [MockGitActor(**co_author) for co_author in (co_authors or [])] + self.message = message + + +class MockGitRepo: + + def __init__(self, name, path, mocked_commits=None): + self.name = name + self.path = path + self.mocked_commits = mocked_commits or [] + + def iter_commits(self): + for json_data in self.mocked_commits: + yield MockGitCommit(**json_data) + + +class MockGitModule: + + def __init__(self, mocked_commits=None): + if mocked_commits is None: + mocked_commits = {} + self._mocked_commits = mocked_commits + + def Repo(self, path): + repo_name = os.path.basename(path) + return MockGitRepo(name=repo_name, path=path, mocked_commits=self._mocked_commits.get(repo_name, [])) + + Actor = MockGitActor + + +SAMPLE_USER_HOME = "/home/jdoe" +SAMPLE_PROJECT_HOME = f"{SAMPLE_USER_HOME}/repos" +SAMPLE_FOO_HOME = f"{SAMPLE_PROJECT_HOME}/foo" + + +@contextlib.contextmanager +def git_context(project_home=SAMPLE_PROJECT_HOME, mocked_commits=None): + with mock.patch.object(contribution_utils_module, "git", MockGitModule(mocked_commits=mocked_commits)): + with mock.patch.object(contribution_utils_module, "PROJECT_HOME", project_home): + yield + + +def test_git_analysis_find_repo(): + + with git_context(): + foo_repo = GitAnalysis.find_repo("foo") + assert isinstance(foo_repo, MockGitRepo) + assert foo_repo.name == 'foo' + assert foo_repo.path == SAMPLE_FOO_HOME + assert foo_repo.mocked_commits == [] + + +def test_git_analysis_git_commits(): + + with git_context(mocked_commits={"foo": [ + { + "hexsha": "aaaa", + "committed_datetime": "2020-01-01 01:23:45", + "author": {"name": "Jdoe", "email": "jdoe@foo"}, + "message": "something" + }, + { + "hexsha": "bbbb", + "committed_datetime": "2020-01-02 12:34:56", + "author": {"name": "Sally", "email": "ssmith@foo"}, + "message": "something else" + } + ]}): + foo_repo = GitAnalysis.find_repo('foo') + foo_commits = list(foo_repo.iter_commits()) + assert len(foo_commits) == 2 + assert all(isinstance(commit, MockGitCommit) for commit in foo_commits) + assert GitAnalysis.json_for_actor(foo_commits[0].author) == {'name': 'Jdoe', 'email': 'jdoe@foo'} + assert GitAnalysis.json_for_actor(foo_commits[1].author) == {'name': 'Sally', 'email': 'ssmith@foo'} + assert foo_commits[0].hexsha == 'aaaa' + assert foo_commits[1].hexsha == 'bbbb' + + assert [GitAnalysis.json_for_commit(commit) for commit in foo_commits] == list(GitAnalysis.git_commits('foo')) + + +def test_git_analysis_iter_commits_scenario(): # Tests .iter_commits, .json_for_commit, .json_for_actor, .git_commits + + with git_context(mocked_commits={"foo": [ + { + "hexsha": "aaaa", + "committed_datetime": "2020-01-01T01:23:45-05:00", + "author": {"name": "Jdoe", "email": "jdoe@foo"}, + "message": "something" + }, + { + "hexsha": "bbbb", + "committed_datetime": "2020-01-02T12:34:56-05:00", + "author": {"name": "Sally", "email": "ssmith@foo"}, + "co_authors": [{"name": "William Simmons", "email": "bill@someplace"}], + "message": "something else" + } + ]}): + foo_repo = GitAnalysis.find_repo("foo") + foo_commits_as_json = [GitAnalysis.json_for_commit(commit) for commit in foo_repo.iter_commits()] + assert foo_commits_as_json == [ + { + 'author': {'email': 'jdoe@foo', 'name': 'Jdoe'}, + 'coauthors': [], + 'commit': 'aaaa', + 'date': '2020-01-01T01:23:45-05:00', + 'message': 'something' + }, + { + 'author': {'email': 'ssmith@foo', 'name': 'Sally'}, + 'coauthors': [{'email': 'bill@someplace', 'name': 'William Simmons'}], + 'commit': 'bbbb', + 'date': '2020-01-02T12:34:56-05:00', + 'message': 'something else' + } + ] + + assert foo_commits_as_json == list(GitAnalysis.git_commits('foo')) + + +def test_contributor_create(): + + # This should execute without error + c1 = Contributor(name="John Doe", email="john@whatever") + + with mock.patch.object(git, "Actor", MockGitActor): + + c2 = Contributor.create(author=git.Actor(name="John Doe", email="john@whatever")) + + assert c1.as_dict() == c2.as_dict() + + +def test_contributor_str(): + + with mock.patch.object(contribution_utils_module, "id", MockId(1000)): + + john = Contributor(names={"John Doe", "jdoe"}, email="john@whatever") + + assert str(john) == "" + + jdoe = Contributor(names={"John Doe", "jdoe"}, primary_name="jdoe", email="john@whatever") + + assert str(jdoe) == "" + + # Edge cases... + + with pytest.raises(ValueError) as exc: + Contributor() + assert str(exc.value) == "One of email= or emails= is required." + + with pytest.raises(ValueError) as exc: + Contributor(email='foo@bar', emails={'foo@bar'}) + assert str(exc.value) == 'Only one of email= and emails= may be provided.' + + with pytest.raises(ValueError) as exc: + Contributor(name='John', names={'John'}, email="foo@bar") + assert str(exc.value) == 'Only one of name= and names= may be provided.' + + +def test_contributor_set_primary_name(): + + john = Contributor(names={"John Doe", "jdoe"}, email="john@whatever") + assert john.primary_name == "John Doe" + + john.names.add("John Q Doe") + assert john.primary_name == "John Q Doe" + + john.set_primary_name("John Doe") + assert john.primary_name == "John Doe" + + john.names.add("John Quincy Doe") + assert john.primary_name == "John Doe" + + assert "Quincy" not in john.names + john.set_primary_name("Quincy") + assert john.primary_name == "Quincy" + assert "Quincy" in john.names + + +def test_contributor_copy(): + + john = Contributor(name="john", email="john@foo") + jack = john.copy() + + assert john.names == {"john"} + assert john.emails == {"john@foo"} + assert john.primary_name == "john" + + jack.names.add("jack") + jack.emails.add("jack@bar") + jack.set_primary_name("jack") + + assert john.names == {"john"} + assert john.emails == {"john@foo"} + assert john.primary_name == "john" + + assert jack.names == {"john", "jack"} + assert jack.emails == {"john@foo", "jack@bar"} + assert jack.primary_name == "jack" + + +def test_contributor_primary_name(): + + email_counter = make_counter() + + def make_contributor(names, primary_name=None): + some_email = f"user{email_counter()}@something.foo" # unique but we don't care about the specific value + return Contributor(names=set(names), email=some_email, primary_name=primary_name) + + assert make_contributor({"John Doe", "jdoe"}).primary_name == "John Doe" + assert make_contributor({"jdoe", "John Doe"}).primary_name == "John Doe" + assert make_contributor({"jdoe", "John Doe", "jdoe123456789"}).primary_name == "John Doe" + assert make_contributor({"jdoe123456789", "John Doe", "jdoe"}).primary_name == "John Doe" + + assert make_contributor({"jdoe123456789", "John Doe", "John Q Doe", "jdoe"}).primary_name == "John Q Doe" + assert make_contributor({"jdoe123456789", "John Q Doe", "John Doe", "jdoe"}).primary_name == "John Q Doe" + + assert make_contributor({"John Doe", "jdoe"}, primary_name="jdoe").primary_name == "jdoe" + assert make_contributor({"jdoe", "John Doe"}, primary_name="jdoe").primary_name == "jdoe" + assert make_contributor({"jdoe", "John Doe", "jdoe123456789"}, primary_name="jdoe").primary_name == "jdoe" + assert make_contributor({"jdoe123456789", "John Doe", "jdoe"}, primary_name="jdoe").primary_name == "jdoe" + + +def test_contributor_from_dict(): + + joe_json = {'names': ["Joe"], "emails": ["joe@wherever"]} + + joe_obj = Contributor.from_dict(joe_json) + + assert isinstance(joe_obj, Contributor) + assert list(joe_obj.emails) == joe_json['emails'] + assert list(joe_obj.names) == joe_json['names'] + + assert joe_obj.as_dict() == joe_json + + joe_obj2 = Contributor.from_dict(joe_json, key="Joseph") + assert joe_obj.names != joe_obj2.names + assert joe_obj.names | {'Joseph'} == joe_obj2.names + + +def test_contributor_notice_mention_as(): + + john = Contributor(name="john", email="john@foo") + + assert john.names == {"john"} + assert john.emails == {"john@foo"} + + john.notice_mention_as(name="jack", email="john@foo") + + assert john.names == {"john", "jack"} + assert john.emails == {"john@foo"} + + john.notice_mention_as(name="john", email="jack@bar") + + assert john.names == {"john", "jack"} + assert john.emails == {"john@foo", "jack@bar"} + + john.notice_mention_as(name="john", email="john@foo") + + assert john.names == {"john", "jack"} + assert john.emails == {"john@foo", "jack@bar"} + + +def test_contributor_as_dict(): + + john = Contributor(name="john", email="john@foo") + + assert john.as_dict() == { + "names": ["john"], + "emails": ["john@foo"] + } + + john.notice_mention_as(name="jack", email="john@foo") + + assert john.as_dict() == { + "names": ["jack", "john"], + "emails": ["john@foo"] + } + + john.notice_mention_as(name="Richard", email="john@foo") + + assert john.as_dict() == { + # As it happens, our sort is case-sensitive. See notes in source, but that's just because it was easiest. + # This test is just to notice on a simple example if that changes, since bigger examples could be harder + # to debug. -kmp 27-Jul-2023 + "names": ["Richard", "jack", "john"], + "emails": ["john@foo"] + } + + +def test_contributor_values_as_objects(): + + assert Contributions.contributor_values_as_objects(None) is None + + idx_json = {'jdoe': {'names': ['jdoe'], 'emails': ['jdoe@somewhere']}} + idx = Contributions.contributor_values_as_objects(idx_json) + jdoe = idx['jdoe'] + + assert isinstance(jdoe, Contributor) + assert jdoe.names == set(idx_json['jdoe']['names']) + assert jdoe.emails == set(idx_json['jdoe']['emails']) + + +def test_contributor_values_as_dicts(): + + assert Contributions.contributor_values_as_dicts(None) is None + + jdoe = Contributor(names={'jdoe'}, emails={'jdoe@somewhere'}) + idx = {'jdoe': jdoe} + idx_json = Contributions.contributor_values_as_dicts(idx) + jdoe_json = idx_json['jdoe'] + + assert isinstance(jdoe_json, dict) + assert set(jdoe_json['names']) == jdoe.names + assert set(jdoe_json['emails']) == jdoe.emails + + +def test_contributor_index_by_primary_name(): + + john = Contributor(names={"John Doe", "jdoe"}, emails={"jdoe@a.foo"}) + jane = Contributor(names={"jsmith", "Jane Smith"}, emails={"jsmith@b.foo"}) + + idx1 = { + "John Doe": john, + "Jane Smith": jane, + "jdoe": john, + "jsmith": jane, + } + + assert list(Contributions.contributor_index_by_primary_name(idx1).items()) == [ + ("Jane Smith", jane), + ("John Doe", john), + ] + + idx2 = { + "jdoe": john, + "jsmith": jane, + } + + assert list(Contributions.contributor_index_by_primary_name(idx2).items()) == [ + ("Jane Smith", jane), + ("John Doe", john), + ] + + idx3 = { + "John Doe": john, + "jdoe": john.copy(), # if the info is here twice, it must be in a shared pointer, not a separate object + "jsmith": jane, + } + + with pytest.raises(Exception) as exc: + # This should notice the improper duplication. + Contributions.contributor_index_by_primary_name(idx3) + + assert str(exc.value).startswith("Name improperly shared") + + +def test_contributions_by_name_from_by_email(): + + email_a = "a@somewhere" + email_alpha = "alpha@somewhere" + + email_b = "b@somewhere" + email_beta = "beta@somewhere" + + name_ajones = "ajones" + name_art = "Art Jones" + + name_bsmith = "bsmith" + name_betty = "Betty Smith" + + art_jones = Contributor(names={name_art, name_ajones}, emails={email_a, email_alpha}) + betty_smith = Contributor(names={name_betty, name_bsmith}, emails={email_b, email_beta}) + + # We should really just need one entry per each person + + by_name_contributors_short = { + name_art: art_jones, + name_betty: betty_smith, + } + + assert Contributions.by_email_from_by_name(by_name_contributors_short) == { + email_a: art_jones, + email_alpha: art_jones, + email_b: betty_smith, + email_beta: betty_smith, + } + + # It's OK for there to be more than one entry, as long as the entries share a single contributor object + # (or contributor json dictionary) + + by_name_contributors = { + name_ajones: art_jones, + name_art: art_jones, + name_bsmith: betty_smith, + name_betty: betty_smith, + } + + assert Contributions.by_email_from_by_name(by_name_contributors) == { + email_a: art_jones, + email_alpha: art_jones, + email_b: betty_smith, + email_beta: betty_smith, + } + + # It works for the targets of the dictionary to be either Contributors or JSON represented Contributors. + + art_jones_json = art_jones.as_dict() + betty_smith_json = betty_smith.as_dict() + + by_name_json = { + name_ajones: art_jones_json, + name_art: art_jones_json, + name_bsmith: betty_smith_json, + name_betty: betty_smith_json, + } + + assert Contributions.by_email_from_by_name(by_name_json) == { + email_a: art_jones_json, + email_alpha: art_jones_json, + email_b: betty_smith_json, + email_beta: betty_smith_json, + } + + # You can't have conflicts between email addresses. That's supposed to have been resolved earlier by unification. + + by_name_json_with_dups = { + "Art Jones": {"names": ["Art Jones"], "emails": ["ajones@somewhere"]}, + "Arthur Jones": {"names": ["Arthur Jones"], "emails": ["ajones@somewhere"]} + } + + with pytest.raises(Exception) as exc: + Contributions.by_email_from_by_name(by_name_json_with_dups) + assert str(exc.value) == "email address ajones@somewhere is used more than once." + + +def test_contributions_traverse_terminal_node(): + + mariela = Contributor(names={'mari', 'mariela'}, emails={'mariela@somewhere', 'mari@elsewhere'}) + mari = Contributor(names={'mari'}, emails={'mariela@somewhere', 'mari@elsewhere'}) + + seen = {mari, mariela} + originally_seen = seen.copy() + + Contributions.traverse(root=mariela, contributors_by_name={}, contributors_by_email={}, + # We're testing the cursor=None case + seen=seen, cursor=mari) + + assert seen == originally_seen + + +def test_contributions_pretty_email(): + + assert Contributions.pretty_email('joe@foo.com') == 'joe@foo.com' + assert Contributions.pretty_email('joe@users.noreply.github.com') == 'joe@github' + assert Contributions.pretty_email('12345+joe@users.noreply.github.com') == 'joe@github' + + +def test_notice_reference_time(): + + timestamps = {} + timestamp0 = datetime.datetime(2020, 7, 1, 12, 0, 0) + timestamp1 = datetime.datetime(2020, 7, 1, 12, 0, 1) + timestamp2 = datetime.datetime(2020, 7, 1, 12, 0, 2) + key = 'joe' + + Contributions.notice_reference_time(key=key, timestamp=timestamp1, timestamps=timestamps) + assert timestamps == {key: timestamp1} + + Contributions.notice_reference_time(key=key, timestamp=timestamp0, timestamps=timestamps) + assert timestamps == {key: timestamp1} + + Contributions.notice_reference_time(key=key, timestamp=timestamp2, timestamps=timestamps) + assert timestamps == {key: timestamp2} + + +def test_existing_contributors_json_file(): + + mfs = MockFileSystem() + + contributions = BasicContributions(repo='foo') # don't need all the git history, etc. loaded for this test + + with mfs.mock_exists_open_remove(): + + assert contributions.existing_contributors_json_file() is None + + cache_file = contributions.contributors_json_file() + print("cache_file=", cache_file) + with io.open(cache_file, 'w') as fp: + fp.write('something') + + assert contributions.existing_contributors_json_file() == cache_file + + +def test_contributions_email_reference_time(): + + contributions = BasicContributions() + now = datetime.datetime.now() + then = now - datetime.timedelta(seconds=10) + contributions.email_timestamps = {'foo@somewhere': now, 'bar@elsewhere': then} + + assert contributions.email_reference_time('foo@somewhere') == now + assert contributions.email_reference_time('bar@elsewhere') == then + assert contributions.email_reference_time('baz@anywhere') is None + + +def test_contributions_name_reference_time(): + + contributions = BasicContributions() + now = datetime.datetime.now() + then = now - datetime.timedelta(seconds=10) + contributions.name_timestamps = {'foo': now, 'bar': then} + + assert contributions.name_reference_time('foo') == now + assert contributions.name_reference_time('bar') == then + assert contributions.name_reference_time('baz') is None + + +def test_file_cache_error_reporting(): + + mfs = MockFileSystem() + with mfs.mock_exists_open_remove(): + + with io.open(Contributions.CONTRIBUTORS_CACHE_FILE, 'w') as fp: + fp.write("{bad json}") + + with printed_output() as printed: + with pytest.raises(json.JSONDecodeError) as exc: + Contributions.get_contributors_json_from_file_cache(Contributions.CONTRIBUTORS_CACHE_FILE,) + assert re.match("Expecting.*line 1 column 2.*", str(exc.value)) + assert printed.lines == ["Error while reading data from 'CONTRIBUTORS.json'."] + + +def test_save_contributor_data(): + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove_abspath_getcwd_chdir(): + + contributions = BasicContributions() + cache_file = contributions.save_contributor_data() + + assert file_contents(cache_file) == ( + '{\n' + ' "forked_at": null,\n' + ' "excluded_fork": null,\n' + ' "pre_fork_contributors_by_name": null,\n' + ' "contributors_by_name": null\n' + '}\n' + ) + + cache_file_2 = contributions.save_contributor_data('some.file') + assert cache_file_2 == 'some.file' + assert file_contents(cache_file_2) == ( + '{\n' + ' "forked_at": null,\n' + ' "excluded_fork": null,\n' + ' "pre_fork_contributors_by_name": null,\n' + ' "contributors_by_name": null\n' + '}\n' + ) + + +def test_repo_contributor_names(): + + contributions = BasicContributions() + tony = Contributor(names={"Tony", "Anthony"}, emails={"tony@foo"}) + juan = Contributor(names={"Juan"}, emails={"juan@foo"}) + contributions.contributors_by_name = { + "Tony": tony, + "Juan": juan, + } + assert list(contributions.repo_contributor_names(with_email=False)) == [ + "Tony", + "Juan", + ] + assert list(contributions.repo_contributor_names(with_email=True)) == [ + "Tony (tony@foo)", + "Juan (juan@foo)", + ] + # We assume de-duplication has already occurred, so something like this will happen. + # This behavior is not a feature, but then again, there's no reason to check/redo what should be done earlier. + contributions.contributors_by_name = { + "Tony": tony, + "Juan": juan, + "Anthony": tony, + } + assert list(contributions.repo_contributor_names(with_email=True)) == [ + "Tony (tony@foo)", + "Juan (juan@foo)", + "Anthony (tony@foo)", + ] + # Note, too, that it's trusting the key because our save code puts the primary_key in the dictionary key, + # and if it's been edited by a human, we'll prefer that. + contributions.contributors_by_name = { + "Tony": tony, + "John": juan, + } + assert list(contributions.repo_contributor_names(with_email=True)) == [ + "Tony (tony@foo)", + "John (juan@foo)", + ] diff --git a/test/test_license_utils.py b/test/test_license_utils.py index ea7d7666f..670b5b0e5 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -11,7 +11,7 @@ LicenseFrameworkRegistry, LicenseFramework, PythonLicenseFramework, JavascriptLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, - logger as license_logger, + warnings as license_utils_warnings_module, ) from dcicutils.misc_utils import ignored, file_contents from dcicutils.qa_utils import printed_output, MockFileSystem @@ -326,7 +326,8 @@ def checker(printed, license_warnings): ' Bar License: library2, library8, libraryA, libraryB', ' Baz License: library4, library7, library9', ' Big-Org-Approved: libraryB', - ' Misc-Copyleft: libraryD'] + ' Misc-Copyleft: libraryD', + 'There is 1 no-longer-missing license: library1.'] javascript_failure = None for warning in license_warnings: @@ -342,7 +343,7 @@ def checker(printed, license_warnings): assert edited_license_warnings == [ "There is 1 unexpectedly missing license: library6.", - "There is 1 no-longer-missing license: library1.", + # "There is 1 no-longer-missing license: library1.", ] do_it() @@ -371,7 +372,7 @@ def check_license_checker_full_scenario_failing(*, perturb_setup, checker, file=fp) print(license_text_for_testing, file=fp) - with mock.patch.object(license_logger, 'warning') as mock_license_logger: + with mock.patch.object(license_utils_warnings_module, 'warn') as mock_license_logger: license_warnings = [] @@ -520,7 +521,7 @@ def test_license_checker_full_scenario_succeeding(): with mock.patch.object(PythonLicenseFramework, "get_dependencies") as mock_get_dependencies: - with mock.patch.object(license_logger, 'warning') as mock_license_logger: + with mock.patch.object(license_utils_warnings_module, 'warn') as mock_license_logger: license_warnings = [] @@ -655,7 +656,7 @@ def test_validate_simple_license_file(): with mfs.mock_exists_open_remove(): - with mock.patch.object(license_logger, 'warning') as mock_license_logger: + with mock.patch.object(license_utils_warnings_module, 'warn') as mock_license_logger: license_warnings = [] diff --git a/test/test_misc.py b/test/test_misc.py index f77253cc6..b790b8d6a 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -2,7 +2,7 @@ import pytest from dcicutils.license_utils import C4PythonInfrastructureLicenseChecker -from dcicutils.qa_checkers import DocsChecker, DebuggingArtifactChecker, ChangeLogChecker +from dcicutils.qa_checkers import DocsChecker, DebuggingArtifactChecker, ChangeLogChecker, ContributionsChecker from .conftest_settings import REPOSITORY_ROOT_DIR @@ -48,3 +48,8 @@ class MyChangeLogChecker(ChangeLogChecker): def test_license_compatibility(): C4PythonInfrastructureLicenseChecker.validate() + + +@pytest.mark.static +def test_contributions(): + ContributionsChecker.validate() diff --git a/test/test_qa_checkers.py b/test/test_qa_checkers.py index 9f0d0ca85..f1aa706c3 100644 --- a/test/test_qa_checkers.py +++ b/test/test_qa_checkers.py @@ -1,12 +1,18 @@ +import io +import json import os import pytest +from dcicutils import contribution_utils as contribution_utils_module +from dcicutils.contribution_utils import BasicContributions from dcicutils.misc_utils import lines_printed_to, remove_prefix from dcicutils.qa_checkers import ( DebuggingArtifactChecker, DocsChecker, ChangeLogChecker, VersionChecker, confirm_no_uses, find_uses, + ContributionsChecker ) from dcicutils.qa_utils import MockFileSystem, printed_output from unittest import mock +from .test_contribution_utils import git_context, SAMPLE_PROJECT_HOME from .conftest_settings import TEST_DIR @@ -252,7 +258,7 @@ def test_debugging_artifact_checker(): assert isinstance(exc.value, AssertionError) assert str(exc.value) == "1 problem detected:\n In foo/bar.py, 1 call to print." - assert printed.lines == [] # We might at some point print the actual problems, but we don't now. + assert printed.lines == [] # We might at some point print the actual problems, but we don't know. with lines_printed_to("foo/bar.py") as out: out('x = 1') @@ -262,4 +268,60 @@ def test_debugging_artifact_checker(): assert isinstance(exc.value, AssertionError) assert str(exc.value) == "1 problem detected:\n In foo/bar.py, 1 active use of pdb.set_trace." - assert printed.lines == [] # We might at some point print the actual problems, but we don't now. + assert printed.lines == [] # We might at some point print the actual problems, but we don't know. + + +@mock.patch.object(contribution_utils_module, "PROJECT_HOME", SAMPLE_PROJECT_HOME) +def test_contribution_checker(): + + print() # start on a fresh line + some_repo_name = 'foo' + mfs = MockFileSystem() + with mfs.mock_exists_open_remove_abspath_getcwd_chdir(): + # Predict which cache file we'll need, so we can make it ahead of time. + contributions_cache_file = BasicContributions(repo=some_repo_name).contributors_json_file() + print(f"contributions_cache_file={contributions_cache_file}") + os.chdir(os.path.join(SAMPLE_PROJECT_HOME, some_repo_name)) + print(f"working dir={os.getcwd()}") + with io.open(contributions_cache_file, 'w') as fp: + cache_data = { + "forked_at": "2015-01-01T12:34:56-05:00", + "excluded_fork": None, + "pre_fork_contributors_by_name": None, + "contributors_by_name": { + "John Smith": { + "emails": ["jsmith@somewhere"], + "names": ["John Smith"], + } + } + } + json.dump(cache_data, fp=fp) + mocked_commits = { + some_repo_name: [ + { + "hexsha": "aaaa", + "committed_datetime": "2016-01-01T01:23:45-05:00", + "author": {"name": "John Smith", "email": "jsmith@somewhere"}, + "message": "something" + }, + { + "hexsha": "bbbb", + "committed_datetime": "2017-01-02T12:34:56-05:00", + "author": {"name": "Sally", "email": "ssmith@elsewhere"}, + "message": "something else" + } + ] + } + with git_context(mocked_commits=mocked_commits): + with printed_output() as printed: + with pytest.raises(AssertionError) as exc: + ContributionsChecker.validate() + assert str(exc.value) == "There are contributor cache discrepancies." + assert printed.lines == [ + "John Smith (jsmith@somewhere)", + "Sally (ssmith@elsewhere)", + "===== THERE ARE CONTRIBUTOR CACHE DISCREPANCIES =====", + "To Add:", + " * contributors.Sally.emails.ssmith@elsewhere", + " * contributors.Sally.names.Sally", + ]