Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Blame via pygit2 instead of subprocess #127

Merged
merged 2 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions git_deps/blame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import subprocess
import re
from dataclasses import dataclass

# The following classes are introduced to imitate their counterparts in pygit2,
# so that the output of 'blame_via_subprocess' can be swapped with pygit2's own
# blame output.

@dataclass
class GitRef:
"""
A reference to a commit
"""
hex: str

@dataclass
class BlameHunk:
"""
A chunk of a blame output which has the same commit information
for a consecutive set of lines
"""
orig_commit_id: GitRef
orig_start_line_number: int
final_start_line_number: int
lines_in_hunk: int = 1


def blame_via_subprocess(path, commit, start_line, num_lines):
"""
Generate a list of blame hunks by calling 'git blame' as a separate process.
This is a workaround for the slowness of pygit2's own blame algorithm.
See https://github.com/aspiers/git-deps/issues/1
"""
cmd = [
'git', 'blame',
'--porcelain',
'-L', "%d,+%d" % (start_line, num_lines),
commit, '--', path
]
output = subprocess.check_output(cmd, universal_newlines=True)

current_hunk = None
for line in output.split('\n'):
m = re.match(r'^([0-9a-f]{40}) (\d+) (\d+) (\d+)$', line)

if m: # starting a new hunk
if current_hunk:
yield current_hunk
dependency_sha1, orig_line_num, line_num, length = m.group(1, 2, 3, 4)
orig_line_num = int(orig_line_num)
line_num = int(line_num)
length = int(length)
current_hunk = BlameHunk(
orig_commit_id=GitRef(dependency_sha1),
orig_start_line_number = orig_line_num,
final_start_line_number = line_num,
lines_in_hunk = length
)

if current_hunk:
yield current_hunk
2 changes: 2 additions & 0 deletions git_deps/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def parse_args():
'[%(default)s]')
parser.add_argument('-d', '--debug', dest='debug', action='store_true',
help='Show debugging')
parser.add_argument('--pygit2-blame', dest='pygit2_blame', action='store_true',
help="Use pygit2's blame algorithm (slower than git's)")

options, args = parser.parse_known_args()

Expand Down
46 changes: 26 additions & 20 deletions git_deps/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from git_deps.gitutils import GitUtils
from git_deps.listener.base import DependencyListener
from git_deps.errors import InvalidCommitish
from git_deps.blame import blame_via_subprocess


class DependencyDetector(object):
Expand Down Expand Up @@ -172,24 +173,26 @@ def blame_diff_hunk(self, dependent, parent, path, hunk):

line_to_culprit = {}

for line in blame.split('\n'):
self.process_hunk_line(dependent, dependent_sha1, parent,
path, line, line_to_culprit)
for blame_hunk in blame:
self.process_blame_hunk(dependent, dependent_sha1, parent,
path, blame_hunk, line_to_culprit)

self.debug_hunk(line_range_before, line_range_after, hunk,
line_to_culprit)

def process_hunk_line(self, dependent, dependent_sha1, parent,
path, line, line_to_culprit):
self.logger.debug(" ! " + line.rstrip())
m = re.match(r'^([0-9a-f]{40}) (\d+) (\d+)( \d+)?$', line)
if not m:
return
def process_blame_hunk(self, dependent, dependent_sha1, parent,
path, blame_hunk, line_to_culprit):

orig_line_num = blame_hunk.orig_start_line_number
line_num = blame_hunk.final_start_line_number
dependency_sha1 = blame_hunk.orig_commit_id.hex
line_representation = f"{dependency_sha1} {orig_line_num} {line_num}"

self.logger.debug(f" ! {line_representation}")

dependency_sha1, orig_line_num, line_num = m.group(1, 2, 3)
line_num = int(line_num)
dependency = self.get_commit(dependency_sha1)
line_to_culprit[line_num] = dependency.hex
for i in range(blame_hunk.lines_in_hunk):
line_to_culprit[line_num + i] = dependency.hex

if self.is_excluded(dependency):
self.logger.debug(
Expand All @@ -206,7 +209,7 @@ def process_hunk_line(self, dependent, dependent_sha1, parent,
self.record_dependency_source(parent,
dependent, dependent_sha1,
dependency, dependency_sha1,
path, line_num, line)
path, line_num, line_representation)

def debug_hunk(self, line_range_before, line_range_after, hunk,
line_to_culprit):
Expand Down Expand Up @@ -234,13 +237,16 @@ def register_new_dependent(self, dependent, dependent_sha1):
self.notify_listeners("new_dependent", dependent)

def run_blame(self, hunk, parent, path):
cmd = [
'git', 'blame',
'--porcelain',
'-L', "%d,+%d" % (hunk.old_start, hunk.old_lines),
parent.hex, '--', path
]
return subprocess.check_output(cmd, universal_newlines=True)
if self.options.pygit2_blame:
return self.repo.blame(path,
newest_commit=parent.hex,
min_line=hunk.old_start,
max_line=hunk.old_start + hunk.old_lines - 1)
else:
return blame_via_subprocess(path,
parent.hex,
hunk.old_start,
hunk.old_lines)

def is_excluded(self, commit):
if self.options.exclude_commits is not None:
Expand Down
9 changes: 9 additions & 0 deletions tests/self_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,21 @@ echo "Running test suite"
echo "* Dependencies of 4f27a1e, a regular commit"
git-deps 4f27a1e^! | sort | diff tests/expected_outputs/deps_4f27a1e -

echo "* Same, but via pygit2's blame algorithm"
git-deps --pygit2-blame 4f27a1e^! | sort | diff tests/expected_outputs/deps_4f27a1e -

echo "* Dependencies of 1ba7ad5, a merge commit"
git-deps 1ba7ad5^! | sort | diff tests/expected_outputs/deps_1ba7ad5 -

echo "* Same, but via pygit2's blame algorithm"
git-deps --pygit2-blame 1ba7ad5^! | sort | diff tests/expected_outputs/deps_1ba7ad5 -

echo "* Dependencies of the root commit"
git-deps b196757^! | sort | diff tests/expected_outputs/deps_b196757 -

echo "* Same, but via pygit2's blame algorithm"
git-deps --pygit2-blame b196757^! | sort | diff tests/expected_outputs/deps_b196757 -

echo "* Recursive dependencies of a4f27a1e, a regular commit"
git-deps -r 4f27a1e^! | sort | diff tests/expected_outputs/recursive_deps_4f27a1e -

Expand Down
24 changes: 24 additions & 0 deletions tests/test_blame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

from git_deps.blame import blame_via_subprocess, BlameHunk, GitRef

def test_blame_via_subprocess():
hunks = list(blame_via_subprocess(
'INSTALL.md',
'04f5c095d4eccf5808db6dbf90c31a535f7f371c',
12, 4))

expected_hunks = [
BlameHunk(
GitRef('6e23a48f888a355ad7e101c797ce1b66c4b7b86a'),
orig_start_line_number=12,
final_start_line_number=12,
lines_in_hunk=2),
BlameHunk(
GitRef('2c9d23b0291157eb1096384ff76e0122747b9bdf'),
orig_start_line_number=10,
final_start_line_number=14,
lines_in_hunk=2)
]

assert hunks == expected_hunks