From 6318a61a2091e9939a85eafecd413fcaa4ee55e9 Mon Sep 17 00:00:00 2001 From: lihuanshuai Date: Tue, 26 Aug 2014 11:24:02 +0800 Subject: [PATCH 1/2] add git.multi-gc --- .gitignore | 1 + ellen/git/gc.py | 165 +++++++++++++++++++++++++++++++++++++++++++++++ ellen/repo.py | 8 +++ tests/test_gc.py | 77 ++++++++++++++++++++++ 4 files changed, 251 insertions(+) create mode 100644 ellen/git/gc.py create mode 100644 tests/test_gc.py diff --git a/.gitignore b/.gitignore index 035e7d5..7feeed7 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ build tmp .ropeproject venv/ +.idea diff --git a/ellen/git/gc.py b/ellen/git/gc.py new file mode 100644 index 0000000..a32c44b --- /dev/null +++ b/ellen/git/gc.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +import os +import re +import sys +from functools import wraps +from ellen.utils.process import git_with_repo + +P_COMMIT = re.compile(r"^([0-9a-f]{40})\s+commit$") +P_OBJ = re.compile(r"^[0-9a-f]{38}$") + +AGGRESSIVE_WINDOW = 250 +AUTO_THRESHOLD = 6700 +AUTO_PACK_LIMIT = 50 +EXPIRE = '2.weeks.ago' +REPACK_ALL_OPTS = {'a': None, 'A': None, 'unpack_unreachable': None} + + +def check_status(f): + @wraps(f) + def wrapper(*a, **kw): + status = f(*a, **kw) + if status['returncode'] != 0: + raise RuntimeError("'%s' failed during git.multi_gc" % f.__name__) + return status + return wrapper + + +@check_status +def git_log(git, *a, **kw): + return git.log(*a, **kw) + + +@check_status +def git_pack_refs(git, *a, **kw): + return git.pack_refs(*a, **kw) + + +@check_status +def git_reflog(git, *a, **kw): + return git.reflog(*a, **kw) + + +@check_status +def git_repack(git, *a, **kw): + return git.repack(*a, **kw) + + +@check_status +def git_prune(git, *a, **kw): + return git.prune(*a, **kw) + + +@check_status +def git_rerere(git, *a, **kw): + return git.rerere(*a, **kw) + + +def _update_repack_all_options(expire=EXPIRE): + if "now" == expire: + REPACK_ALL_OPTS['a'] = True + elif expire: + REPACK_ALL_OPTS['A'] = True + REPACK_ALL_OPTS['unpack_unreachable'] = expire + + +def _too_many_loose_objects(repository): + obj_dir = os.path.join(repository.path, "objects/") + if AUTO_THRESHOLD <= 0: + return False + auto_thr = (AUTO_THRESHOLD + 255) // 256 + if not os.path.isdir(obj_dir): + return False + files = os.listdir(obj_dir) + root = obj_dir + if not files: + return False + for f in files: + path = os.path.join(root, f) + if os.path.isdir(path): + root = path + break + cnt = 0 + for f in os.listdir(root): + path = os.path.join(root, f) + if os.path.isfile(path) and P_OBJ.search(f): + cnt += 1 + if cnt > auto_thr: + return True + return False + + +def _too_many_packs(repository): + if AUTO_PACK_LIMIT <= 0: + return False + path = os.path.join(repository.path, "objects/info/packs") + if not os.path.isfile(path): + return False + with open(path, 'r') as f: + lines = f.readlines() + packs = len(lines) - 1 + if packs >= AUTO_PACK_LIMIT: + return True + return False + + +def need_to_gc(repository, expire=EXPIRE): + if AUTO_THRESHOLD <= 0: + return False + if _too_many_packs(repository): + _update_repack_all_options(expire=expire) + elif not _too_many_loose_objects(): + return False + return True + + +def gc_repository(repository, forks, auto=None, prune=None): + """git gc command + """ + expire = 'now' if prune == 'all' else prune + if not expire: + expire = EXPIRE + + try: + git = git_with_repo(repository) + status = {'returncode': 0, 'fullcmd': '%s multi-gc' % ' '.join(git.cmds), 'stderr': '', 'stdout': ''} + if prune: + prune_opt = "--prune=" + prune + status['fullcmd'] += ' ' + prune_opt + if not forks: + return git.gc(prune_opt, auto=auto) if prune else git.gc(auto=auto) + else: + paths = [ "--fork='%s'" % r.path for r in forks] + status['fullcmd'] += ' ' + ' '.join(paths) + + if auto: + status['fullcmd'] += ' --auto' + if not need_to_gc(repository, expire=expire): + return status + else: + _update_repack_all_options(expire=expire) + + git_pack_refs(git, all=True, prune=True) + git_reflog(git, 'expire', all=True) + git_repack(git, d=True, l=True, a=REPACK_ALL_OPTS['a'], + A=REPACK_ALL_OPTS['A'], + unpack_unreachable=REPACK_ALL_OPTS['unpack_unreachable']) + + # seek commits to be pruned + all_fork_commits = [] + commits = set() + for f in forks: + fork_git = git_with_repo(f) + all_fork_commits += git_log(fork_git, '--pretty=format:%H', all=True)['stdout'].splitlines() + for line in git_prune(git, dry_run=True, expire=expire)['stdout'].splitlines(): + matcher = P_COMMIT.search(line) + if matcher: + commits.add(matcher.group(1)) + commits &= set(all_fork_commits) + + git_prune(git, *commits, expire=expire) + git_rerere(git, 'gc') + except Exception as e: + print >>sys.stderr, e + status['returncode'] = -1 + return status diff --git a/ellen/repo.py b/ellen/repo.py index 6fc2040..7e0c0f8 100644 --- a/ellen/repo.py +++ b/ellen/repo.py @@ -12,6 +12,7 @@ from ellen.git.tag import list_tags, create_tag from ellen.git.commit import create_commit from ellen.git.diff import diff_wrapper as diff +from ellen.git.gc import gc_repository from ellen.git.ref import update_ref from ellen.git.clone import clone_repository, update_server_info from ellen.git.init import init_repository @@ -261,6 +262,13 @@ def create_tag(self, name, ref, author_name, author_email, message): def update_hooks(self, path): return update_hooks(self.repository, path) + def gc(self, fork_paths=None, auto=None, prune=None): + forks = [] + if isinstance(fork_paths, (list, tuple)): + for p in fork_paths: + forks.append(repository(p)) + return gc_repository(self.repository, forks, auto=auto, prune=prune) + def repository(path): try: diff --git a/tests/test_gc.py b/tests/test_gc.py new file mode 100644 index 0000000..5efd344 --- /dev/null +++ b/tests/test_gc.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +from pygit2 import Repository +from pygit2 import is_repository +from _base import BareRepoTest +from ellen.repo import Jagare + +class test_gc(BareRepoTest): + + def test_simple(self): + repo = Jagare(self.path) + pygit2_repo = Repository(self.path) + ret = repo.gc() + self.assertEqual(ret['returncode'], 0) + self.assertTrue('gc' in ret['fullcmd']) + self.assertFalse(pygit2_repo.is_empty) + self.assertTrue(pygit2_repo.is_bare) + self.assertFalse(repo.empty) + self.assertTrue(repo.bare) + + def test_simple_auto(self): + repo = Jagare(self.path) + pygit2_repo = Repository(self.path) + ret = repo.gc(auto=True) + self.assertEqual(ret['returncode'], 0) + self.assertTrue('gc' in ret['fullcmd']) + self.assertFalse(pygit2_repo.is_empty) + self.assertTrue(pygit2_repo.is_bare) + self.assertFalse(repo.empty) + self.assertTrue(repo.bare) + + def test_simple_all(self): + repo = Jagare(self.path) + pygit2_repo = Repository(self.path) + ret = repo.gc(prune='all') + self.assertEqual(ret['returncode'], 0) + self.assertTrue('gc' in ret['fullcmd']) + self.assertFalse(pygit2_repo.is_empty) + self.assertTrue(pygit2_repo.is_bare) + self.assertFalse(repo.empty) + self.assertTrue(repo.bare) + + def test_multi(self): + repo = Jagare(self.path) + path = self.get_temp_path() + clone_repo = repo.clone(path, shared=True) + pygit2_repo = Repository(path) + ret = repo.gc(fork_paths=path) + self.assertTrue(is_repository(path)) + self.assertFalse(pygit2_repo.is_empty) + self.assertFalse(pygit2_repo.is_bare) + self.assertFalse(clone_repo.empty) + self.assertFalse(clone_repo.bare) + + def test_multi_auto(self): + repo = Jagare(self.path) + path = self.get_temp_path() + clone_repo = repo.clone(path, shared=True) + pygit2_repo = Repository(path) + ret = repo.gc(fork_paths=path, auto=True) + self.assertTrue(is_repository(path)) + self.assertFalse(pygit2_repo.is_empty) + self.assertFalse(pygit2_repo.is_bare) + self.assertFalse(clone_repo.empty) + self.assertFalse(clone_repo.bare) + + def test_multi_all(self): + repo = Jagare(self.path) + path = self.get_temp_path() + clone_repo = repo.clone(path, shared=True) + pygit2_repo = Repository(path) + ret = repo.gc(fork_paths=path, auto=True) + self.assertTrue(is_repository(path)) + self.assertFalse(pygit2_repo.is_empty) + self.assertFalse(pygit2_repo.is_bare) + self.assertFalse(clone_repo.empty) + self.assertFalse(clone_repo.bare) From b7ebcd674257550541af713410e4418bc20f6804 Mon Sep 17 00:00:00 2001 From: lihuanshuai Date: Sat, 6 Sep 2014 22:10:39 +0800 Subject: [PATCH 2/2] improve prune --- ellen/git/gc.py | 118 +++++++++++++++++++++++------------------------ tests/test_gc.py | 54 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 59 deletions(-) diff --git a/ellen/git/gc.py b/ellen/git/gc.py index a32c44b..bfbea07 100644 --- a/ellen/git/gc.py +++ b/ellen/git/gc.py @@ -5,62 +5,36 @@ from functools import wraps from ellen.utils.process import git_with_repo -P_COMMIT = re.compile(r"^([0-9a-f]{40})\s+commit$") P_OBJ = re.compile(r"^[0-9a-f]{38}$") AGGRESSIVE_WINDOW = 250 AUTO_THRESHOLD = 6700 AUTO_PACK_LIMIT = 50 EXPIRE = '2.weeks.ago' -REPACK_ALL_OPTS = {'a': None, 'A': None, 'unpack_unreachable': None} +_OPTS = {'repack_all': {}} def check_status(f): @wraps(f) def wrapper(*a, **kw): + fn = a[0] status = f(*a, **kw) if status['returncode'] != 0: - raise RuntimeError("'%s' failed during git.multi_gc" % f.__name__) + raise RuntimeError("'%s' failed during git.multi_gc" % fn.__name__) return status return wrapper @check_status -def git_log(git, *a, **kw): - return git.log(*a, **kw) - - -@check_status -def git_pack_refs(git, *a, **kw): - return git.pack_refs(*a, **kw) - - -@check_status -def git_reflog(git, *a, **kw): - return git.reflog(*a, **kw) - - -@check_status -def git_repack(git, *a, **kw): - return git.repack(*a, **kw) - - -@check_status -def git_prune(git, *a, **kw): - return git.prune(*a, **kw) - - -@check_status -def git_rerere(git, *a, **kw): - return git.rerere(*a, **kw) +def git_process(fn, *a, **kw): + return fn(*a, **kw) def _update_repack_all_options(expire=EXPIRE): - if "now" == expire: - REPACK_ALL_OPTS['a'] = True - elif expire: - REPACK_ALL_OPTS['A'] = True - REPACK_ALL_OPTS['unpack_unreachable'] = expire + a = True if "now" == expire else None + A = True if expire else None + unpack_unreachable = expire if expire else None + _OPTS['repack_all'] = dict(a=a, A=A, unpack_unreachable=unpack_unreachable) def _too_many_loose_objects(repository): @@ -94,7 +68,7 @@ def _too_many_packs(repository): return False path = os.path.join(repository.path, "objects/info/packs") if not os.path.isfile(path): - return False + return False with open(path, 'r') as f: lines = f.readlines() packs = len(lines) - 1 @@ -113,6 +87,37 @@ def need_to_gc(repository, expire=EXPIRE): return True +class BfsQue(object): + def __init__(self, wanted, cnd_fn=lambda x, s: x in s): + self.data = [] + self.visited = [] + self.wanted = wanted + self.cnd = cnd_fn + + def _visit(self, item): + addq = lambda q, x: q.append(x) + if self.cnd(item, self.wanted) and item not in self.data: + addq(self.data, item) + return True + addq(self.visited, item) + return False + + def search(self, item): + empty = lambda x: len(x) == 0 + addq = lambda q, x: q.append(x) + delq = lambda x: x.pop(0) + avail = [] + if self._visit(item): + return + addq(avail, item) + while not empty(avail): + c = delq(avail) + for p in c.parents: + if p in self.visited or self._visit(p): + continue + addq(avail, p) + + def gc_repository(repository, forks, auto=None, prune=None): """git gc command """ @@ -120,14 +125,16 @@ def gc_repository(repository, forks, auto=None, prune=None): if not expire: expire = EXPIRE + git = git_with_repo(repository) + status = {'returncode': 0, 'fullcmd': '%s multi-gc' % ' '.join(git.cmds), 'stderr': '', 'stdout': ''} try: - git = git_with_repo(repository) - status = {'returncode': 0, 'fullcmd': '%s multi-gc' % ' '.join(git.cmds), 'stderr': '', 'stdout': ''} + prune_opts = [] if prune: - prune_opt = "--prune=" + prune - status['fullcmd'] += ' ' + prune_opt + prune_opts.append("--prune=" + prune) + status['fullcmd'] += ' ' + prune_opts[0] + if not forks: - return git.gc(prune_opt, auto=auto) if prune else git.gc(auto=auto) + return git.gc(*prune_opts, auto=auto) else: paths = [ "--fork='%s'" % r.path for r in forks] status['fullcmd'] += ' ' + ' '.join(paths) @@ -139,26 +146,19 @@ def gc_repository(repository, forks, auto=None, prune=None): else: _update_repack_all_options(expire=expire) - git_pack_refs(git, all=True, prune=True) - git_reflog(git, 'expire', all=True) - git_repack(git, d=True, l=True, a=REPACK_ALL_OPTS['a'], - A=REPACK_ALL_OPTS['A'], - unpack_unreachable=REPACK_ALL_OPTS['unpack_unreachable']) + git_process(git.pack_refs, all=True, prune=True) + git_process(git.reflog, 'expire', all=True) + git_process(git.repack, d=True, l=True, **_OPTS['repack_all']) - # seek commits to be pruned - all_fork_commits = [] - commits = set() + que = BfsQue(repository, cnd_fn=lambda commit, repo: commit.id in repo) for f in forks: - fork_git = git_with_repo(f) - all_fork_commits += git_log(fork_git, '--pretty=format:%H', all=True)['stdout'].splitlines() - for line in git_prune(git, dry_run=True, expire=expire)['stdout'].splitlines(): - matcher = P_COMMIT.search(line) - if matcher: - commits.add(matcher.group(1)) - commits &= set(all_fork_commits) - - git_prune(git, *commits, expire=expire) - git_rerere(git, 'gc') + refs = f.listall_references() + for ref in refs: + ref_commit = f.lookup_reference(ref).get_object() + que.search(ref_commit) + commits = [str(c.id) for c in que.data] + git_process(git.prune, *commits, expire=expire) + git_process(git.rerere, 'gc') except Exception as e: print >>sys.stderr, e status['returncode'] = -1 diff --git a/tests/test_gc.py b/tests/test_gc.py index 5efd344..2f316b9 100644 --- a/tests/test_gc.py +++ b/tests/test_gc.py @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- +import unittest from pygit2 import Repository from pygit2 import is_repository from _base import BareRepoTest from ellen.repo import Jagare +from ellen.git.gc import BfsQue class test_gc(BareRepoTest): @@ -75,3 +77,55 @@ def test_multi_all(self): self.assertFalse(pygit2_repo.is_bare) self.assertFalse(clone_repo.empty) self.assertFalse(clone_repo.bare) + + +class Node(object): + def __init__(self, id, parents): + self.id = id + self.parents = parents + + +class test_que(unittest.TestCase): + def setUp(self): + self.node7 = Node(7, []) + self.node5, self.node6 = Node(5, [self.node7]), Node(6, [self.node7]) + self.node3, self.node4 = Node(3, [self.node5]), Node(4, [self.node6]) + self.node2 = Node(2, [self.node3, self.node4]) + self.node1 = Node(1, [self.node2]) + self.first_node = Node(0, [self.node1]) + self.cnd_fn = lambda item, wanted: item in wanted + + def test_basic1(self): + self.que = BfsQue([self.first_node], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.first_node]) + + def test_basic2(self): + self.que = BfsQue([self.node1], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node1]) + + def test_basic3(self): + self.que = BfsQue([self.node3, self.node4], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node3, self.node4]) + + def test_basic4(self): + self.que = BfsQue([self.node5, self.node6], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node5, self.node6]) + + def test_basic5(self): + self.que = BfsQue([self.node7], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node7]) + + def test_neg1(self): + self.que = BfsQue([self.node3], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node3]) + + def test_neg2(self): + self.que = BfsQue([self.node5], cnd_fn=self.cnd_fn) + self.que.search(self.first_node) + self.assertEqual(self.que.data, [self.node5]) \ No newline at end of file