Skip to content

Commit

Permalink
Git fetcher: Improve submodule handling
Browse files Browse the repository at this point in the history
Instead of making a complete copy of the repo, fetching the
submodules, and writing the result to the store (all of which is
very expensive), we now fetch the submodules recursively using the Git
fetcher, and return a union accessor that "mounts" the accessors for
the submodules on top of the root accessor.
  • Loading branch information
edolstra committed Oct 31, 2023
1 parent ee36a44 commit d88106d
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 82 deletions.
78 changes: 78 additions & 0 deletions src/libfetchers/git-utils.cc
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#include "git-utils.hh"
#include "input-accessor.hh"
#include "cache.hh"
#include "finally.hh"

#include <boost/core/span.hpp>

#include <git2/blob.h>
#include <git2/commit.h>
#include <git2/config.h>
#include <git2/describe.h>
#include <git2/errors.h>
#include <git2/global.h>
Expand All @@ -14,6 +16,7 @@
#include <git2/remote.h>
#include <git2/repository.h>
#include <git2/status.h>
#include <git2/submodule.h>
#include <git2/tree.h>

#include <unordered_set>
Expand Down Expand Up @@ -63,6 +66,8 @@ typedef std::unique_ptr<git_reference, Deleter<git_reference_free>> Reference;
typedef std::unique_ptr<git_describe_result, Deleter<git_describe_result_free>> DescribeResult;
typedef std::unique_ptr<git_status_list, Deleter<git_status_list_free>> StatusList;
typedef std::unique_ptr<git_remote, Deleter<git_remote_free>> Remote;
typedef std::unique_ptr<git_config, Deleter<git_config_free>> GitConfig;
typedef std::unique_ptr<git_config_iterator, Deleter<git_config_iterator_free>> ConfigIterator;

// A helper to ensure that we don't leak objects returned by libgit2.
template<typename T>
Expand Down Expand Up @@ -256,6 +261,17 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
return std::nullopt;
}

std::vector<Submodule> getSubmodules(const Hash & rev) override;

std::string resolveSubmoduleUrl(const std::string & url) override
{
git_buf buf = GIT_BUF_INIT;
if (git_submodule_resolve_url(&buf, *this, url.c_str()))
throw Error("resolving Git submodule URL '%s'", url);
Finally cleanup = [&]() { git_buf_dispose(&buf); };
return buf.ptr;
}

bool hasObject(const Hash & oid_) override
{
auto oid = hashToOID(oid_);
Expand Down Expand Up @@ -400,6 +416,16 @@ struct GitInputAccessor : InputAccessor
return readBlob(path, true);
}

/* Return the commit hash stored in the tree entry at `path`. Throws
   Error if that entry is not of type GIT_OBJECT_COMMIT, i.e. if
   `path` is not a submodule. */
Hash getSubmoduleRev(const CanonPath & path)
{
    auto submoduleEntry = need(path);

    const bool isCommitEntry = git_tree_entry_type(submoduleEntry) == GIT_OBJECT_COMMIT;

    if (isCommitEntry)
        return toHash(*git_tree_entry_id(submoduleEntry));

    throw Error("'%s' is not a submodule", showPath(path));
}

std::map<CanonPath, TreeEntry> lookupCache;

/* Recursively look up 'path' relative to the root. */
Expand Down Expand Up @@ -495,4 +521,56 @@ ref<InputAccessor> GitRepoImpl::getAccessor(const Hash & rev)
return make_ref<GitInputAccessor>(ref<GitRepoImpl>(shared_from_this()), rev);
}

std::vector<GitRepoImpl::Submodule> GitRepoImpl::getSubmodules(const Hash & rev)
{
/* Read the .gitmodules files from this revision. */
CanonPath modulesFile(".gitmodules");

auto accessor = getAccessor(rev);
if (!accessor->pathExists(modulesFile)) return {};

/* Parse it. */
auto configS = accessor->readFile(modulesFile);

auto [fdTemp, pathTemp] = createTempFile("nix-git-submodules");
writeFull(fdTemp.get(), configS);

GitConfig config;
if (git_config_open_ondisk(Setter(config), pathTemp.c_str()))
throw Error("parsing .gitmodules file: %s", git_error_last()->message);

ConfigIterator it;
if (git_config_iterator_glob_new(Setter(it), config.get(), "^submodule\\..*\\.(path|url|branch)$"))
throw Error("iterating over .gitmodules: %s", git_error_last()->message);

std::map<std::string, std::string> entries;

while (true) {
git_config_entry * entry = nullptr;
if (auto err = git_config_next(&entry, it.get())) {
if (err == GIT_ITEROVER) break;
throw Error("iterating over .gitmodules: %s", git_error_last()->message);
}
entries.emplace(entry->name + 10, entry->value);
}

std::vector<Submodule> result;

for (auto & [key, value] : entries) {
if (!hasSuffix(key, ".path")) continue;
std::string key2(key, 0, key.size() - 5);
auto path = CanonPath(value);
auto rev = accessor.dynamic_pointer_cast<GitInputAccessor>()->getSubmoduleRev(path);
result.push_back(Submodule {
.path = path,
.url = entries[key2 + ".url"],
.branch = entries[key2 + ".branch"],
.rev = rev,
});
}

return result;
}


}
12 changes: 12 additions & 0 deletions src/libfetchers/git-utils.hh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ struct GitRepo
/* Get the ref that HEAD points to. */
virtual std::optional<std::string> getWorkdirRef() = 0;

/* A submodule declared in the `.gitmodules` file of a revision. */
struct Submodule
{
    /* Value of the `submodule.<name>.path` setting. */
    CanonPath path;
    /* Value of the `submodule.<name>.url` setting as written in
       `.gitmodules` (empty if unset). */
    std::string url;
    /* Value of the `submodule.<name>.branch` setting (empty if unset). */
    std::string branch;
    /* Commit recorded for `path` in the tree of the containing revision. */
    Hash rev;
};

/* Get the submodules declared in the `.gitmodules` file of revision
   `rev`. The result is empty if that revision has no such file. */
virtual std::vector<Submodule> getSubmodules(const Hash & rev) = 0;

/* Resolve the submodule URL `url` (e.g. a relative one) in the
   context of this repository and return the resulting URL. */
virtual std::string resolveSubmoduleUrl(const std::string & url) = 0;

struct TarballInfo
{
Hash treeHash;
Expand Down
105 changes: 31 additions & 74 deletions src/libfetchers/git.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "util.hh"
#include "git.hh"
#include "fs-input-accessor.hh"
#include "union-input-accessor.hh"
#include "git-utils.hh"

#include "fetch-settings.hh"
Expand Down Expand Up @@ -134,11 +135,6 @@ std::optional<std::string> readHeadCached(const std::string & actualUrl)
return std::nullopt;
}

/* Path filter predicate: true for every path whose last component is
   not ".git". */
bool isNotDotGitDirectory(const Path & path)
{
    const auto lastComponent = baseNameOf(path);
    return !(lastComponent == ".git");
}

} // end namespace

struct GitInputScheme : InputScheme
Expand Down Expand Up @@ -413,7 +409,7 @@ struct GitInputScheme : InputScheme

std::string name = input.getName();

auto makeResult2 = [&](const Attrs & infoAttrs, ref<InputAccessor> accessor) -> std::pair<ref<InputAccessor>, Input>
auto makeResult = [&](const Attrs & infoAttrs, ref<InputAccessor> accessor) -> std::pair<ref<InputAccessor>, Input>
{
assert(input.getRev());
assert(!origRev || origRev == input.getRev());
Expand All @@ -424,18 +420,6 @@ struct GitInputScheme : InputScheme
return {accessor, std::move(input)};
};

auto makeResult = [&](const Attrs & infoAttrs, const StorePath & storePath) -> std::pair<ref<InputAccessor>, Input>
{
// FIXME: remove?
//input.attrs.erase("narHash");
auto narHash = store->queryPathInfo(storePath)->narHash;
input.attrs.insert_or_assign("narHash", narHash.to_string(HashFormat::SRI, true));

auto accessor = makeStorePathAccessor(store, storePath, makeNotAllowedError(repoInfo.url));

return makeResult2(infoAttrs, accessor);
};

auto originalRef = input.getRef();
auto ref = originalRef ? *originalRef : getDefaultRef(repoInfo);
input.attrs.insert_or_assign("ref", ref);
Expand Down Expand Up @@ -542,66 +526,39 @@ struct GitInputScheme : InputScheme

printTalkative("using revision %s of repo '%s'", rev.gitRev(), repoInfo.url);

if (!repoInfo.submodules) {
auto accessor = GitRepo::openRepo(CanonPath(repoDir))->getAccessor(rev);
return makeResult2(infoAttrs, accessor);
}

else {
// FIXME: use libgit2
Path tmpDir = createTempDir();
AutoDelete delTmpDir(tmpDir, true);
PathFilter filter = defaultPathFilter;

Activity act(*logger, lvlChatty, actUnknown, fmt("copying Git tree '%s' to the store", input.to_string()));

Path tmpGitDir = createTempDir();
AutoDelete delTmpGitDir(tmpGitDir, true);

runProgram("git", true, { "-c", "init.defaultBranch=" + gitInitialBranch, "init", tmpDir, "--separate-git-dir", tmpGitDir });

{
// TODO: repoDir might lack the ref (it only checks if rev
// exists, see FIXME above) so use a big hammer and fetch
// everything to ensure we get the rev.
Activity act(*logger, lvlTalkative, actUnknown, fmt("making temporary clone of '%s'", repoDir));
runProgram("git", true, { "-C", tmpDir, "fetch", "--quiet", "--force",
"--update-head-ok", "--", repoDir, "refs/*:refs/*" }, {}, true);
auto repo = GitRepo::openRepo(CanonPath(repoDir));

auto accessor = repo->getAccessor(rev);

/* If the repo has submodules, fetch them and return a union
input accessor consisting of the accessor for the top-level
repo and the accessors for the submodules. */
if (repoInfo.submodules) {
std::map<CanonPath, nix::ref<InputAccessor>> mounts;

for (auto & submodule : repo->getSubmodules(rev)) {
auto resolved = repo->resolveSubmoduleUrl(submodule.url);
debug("Git submodule %s: %s %s %s -> %s",
submodule.path, submodule.url, submodule.branch, submodule.rev.gitRev(), resolved);
fetchers::Attrs attrs;
attrs.insert_or_assign("type", "git");
attrs.insert_or_assign("url", resolved);
if (submodule.branch != "")
attrs.insert_or_assign("ref", submodule.branch);
attrs.insert_or_assign("rev", submodule.rev.gitRev());
auto submoduleInput = fetchers::Input::fromAttrs(std::move(attrs));
auto [submoduleAccessor, submoduleInput2] =
submoduleInput.scheme->getAccessor(store, submoduleInput);
mounts.insert_or_assign(submodule.path, submoduleAccessor);
}

runProgram("git", true, { "-C", tmpDir, "checkout", "--quiet", rev.gitRev() });

/* Ensure that we use the correct origin for fetching
submodules. This matters for submodules with relative
URLs. */
if (repoInfo.isLocal) {
writeFile(tmpGitDir + "/config", readFile(repoDir + "/" + repoInfo.gitDir + "/config"));

/* Restore the config.bare setting we may have just
copied erroneously from the user's repo. */
runProgram("git", true, { "-C", tmpDir, "config", "core.bare", "false" });
} else
runProgram("git", true, { "-C", tmpDir, "config", "remote.origin.url", repoInfo.url });

/* As an optimisation, copy the modules directory of the
source repo if it exists. */
auto modulesPath = repoDir + "/" + repoInfo.gitDir + "/modules";
if (pathExists(modulesPath)) {
Activity act(*logger, lvlTalkative, actUnknown, fmt("copying submodules of '%s'", repoInfo.url));
runProgram("cp", true, { "-R", "--", modulesPath, tmpGitDir + "/modules" });
if (!mounts.empty()) {
mounts.insert_or_assign(CanonPath::root, accessor);
accessor = makeUnionInputAccessor(std::move(mounts));
}

{
Activity act(*logger, lvlTalkative, actUnknown, fmt("fetching submodules of '%s'", repoInfo.url));
runProgram("git", true, { "-C", tmpDir, "submodule", "--quiet", "update", "--init", "--recursive" }, {}, true);
}

filter = isNotDotGitDirectory;

auto storePath = store->addToStore(name, tmpDir, FileIngestionMethod::Recursive, htSHA256, filter);

return makeResult(infoAttrs, std::move(storePath));
}

return makeResult(infoAttrs, accessor);
}

std::pair<ref<InputAccessor>, Input> getAccessorFromWorkdir(
Expand Down
80 changes: 80 additions & 0 deletions src/libfetchers/union-input-accessor.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include "union-input-accessor.hh"

namespace nix {

/* An accessor made up of several accessors, each "mounted" at the
   path that is its key in `mounts`. Every operation is forwarded to
   the accessor mounted at the nearest parent of the requested path,
   with the path rewritten to be relative to that mount point. */
struct UnionInputAccessor : InputAccessor
{
    /* Mount point -> accessor serving everything below it. */
    std::map<CanonPath, ref<InputAccessor>> mounts;

    UnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> _mounts)
        : mounts(std::move(_mounts))
    {
        // Currently we require a root filesystem. This could be relaxed.
        assert(mounts.contains(CanonPath::root));

        // FIXME: should check that every mount point exists. Or we
        // could return dummy parent directories automatically.
    }

    std::string readFile(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readFile(target.second);
    }

    bool pathExists(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->pathExists(target.second);
    }

    Stat lstat(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->lstat(target.second);
    }

    DirEntries readDirectory(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readDirectory(target.second);
    }

    std::string readLink(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->readLink(target.second);
    }

    std::string showPath(const CanonPath & path) override
    {
        auto target = resolve(path);
        return target.first->showPath(target.second);
    }

    /* Map `path` to the accessor mounted at its nearest parent mount
       point, together with the remainder of `path` below that mount
       point. */
    std::pair<ref<InputAccessor>, CanonPath> resolve(CanonPath path)
    {
        // Components stripped off the end of `path` so far, innermost first.
        std::vector<std::string> stripped;

        // Walk up towards the root until we hit a mount point. The
        // constructor guarantees that the root itself is one.
        auto mount = mounts.find(path);
        while (mount == mounts.end()) {
            assert(!path.isRoot());
            stripped.push_back(std::string(*path.baseName()));
            path.pop();
            mount = mounts.find(path);
        }

        // Re-assemble the stripped components, outermost first, into
        // a path relative to the mount point.
        auto relative = CanonPath::root;
        for (auto component = stripped.rbegin(); component != stripped.rend(); ++component)
            relative.push(*component);

        return {mount->second, std::move(relative)};
    }
};

/* Wrap `mounts` (mount point -> accessor) in a UnionInputAccessor.
   The constructor of UnionInputAccessor asserts that `mounts` has an
   entry for the root path. */
ref<InputAccessor> makeUnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> mounts)
{
    ref<InputAccessor> unionAccessor = make_ref<UnionInputAccessor>(std::move(mounts));
    return unionAccessor;
}

}
9 changes: 9 additions & 0 deletions src/libfetchers/union-input-accessor.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include "input-accessor.hh"

namespace nix {

/* Create an accessor that combines the accessors in `mounts`. Each
   key is the path at which the corresponding accessor is mounted; a
   path is served by the accessor mounted at its nearest parent mount
   point. `mounts` must contain an entry for the root path. */
ref<InputAccessor> makeUnionInputAccessor(std::map<CanonPath, ref<InputAccessor>> mounts);

}
8 changes: 0 additions & 8 deletions tests/functional/fetchGitSubmodules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,3 @@ cloneRepo=$TEST_ROOT/a/b/gitSubmodulesClone # NB /a/b to make the relative path
git clone $rootRepo $cloneRepo
pathIndirect=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath")
[[ $pathIndirect = $pathWithRelative ]]

# Test that if the clone has the submodule already, we're not fetching
# it again.
git -C $cloneRepo submodule update --init
rm $TEST_HOME/.cache/nix/fetcher-cache*
rm -rf $subRepo
pathSubmoduleGone=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath")
[[ $pathSubmoduleGone = $pathWithRelative ]]

0 comments on commit d88106d

Please sign in to comment.