git/builtin/commit-graph.c

217 lines
5.5 KiB
C
Raw Normal View History

#include "builtin.h"
#include "config.h"
#include "dir.h"
#include "lockfile.h"
#include "parse-options.h"
#include "repository.h"
#include "commit-graph.h"
static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph [--object-dir <objdir>]"),
N_("git commit-graph read [--object-dir <objdir>]"),
N_("git commit-graph verify [--object-dir <objdir>]"),
N_("git commit-graph write [--object-dir <objdir>] [--append] [--reachable|--stdin-packs|--stdin-commits]"),
NULL
};
static const char * const builtin_commit_graph_verify_usage[] = {
N_("git commit-graph verify [--object-dir <objdir>]"),
NULL
};
static const char * const builtin_commit_graph_read_usage[] = {
N_("git commit-graph read [--object-dir <objdir>]"),
NULL
};
static const char * const builtin_commit_graph_write_usage[] = {
N_("git commit-graph write [--object-dir <objdir>] [--append] [--reachable|--stdin-packs|--stdin-commits]"),
NULL
};
static struct opts_commit_graph {
const char *obj_dir;
int reachable;
int stdin_packs;
int stdin_commits;
int append;
} opts;
static int graph_verify(int argc, const char **argv)
{
struct commit_graph *graph = NULL;
char *graph_name;
static struct option builtin_commit_graph_verify_options[] = {
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
OPT_END(),
};
argc = parse_options(argc, argv, NULL,
builtin_commit_graph_verify_options,
builtin_commit_graph_verify_usage, 0);
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
FREE_AND_NULL(graph_name);
if (!graph)
return 0;
UNLEAK(graph);
return verify_commit_graph(the_repository, graph);
}
static int graph_read(int argc, const char **argv)
{
struct commit_graph *graph = NULL;
char *graph_name;
static struct option builtin_commit_graph_read_options[] = {
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
OPT_END(),
};
argc = parse_options(argc, argv, NULL,
builtin_commit_graph_read_options,
builtin_commit_graph_read_usage, 0);
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
graph_name = get_commit_graph_filename(opts.obj_dir);
graph = load_commit_graph_one(graph_name);
if (!graph)
die("graph file %s does not exist", graph_name);
FREE_AND_NULL(graph_name);
printf("header: %08x %d %d %d %d\n",
ntohl(*(uint32_t*)graph->data),
*(unsigned char*)(graph->data + 4),
*(unsigned char*)(graph->data + 5),
*(unsigned char*)(graph->data + 6),
*(unsigned char*)(graph->data + 7));
printf("num_commits: %u\n", graph->num_commits);
printf("chunks:");
if (graph->chunk_oid_fanout)
printf(" oid_fanout");
if (graph->chunk_oid_lookup)
printf(" oid_lookup");
if (graph->chunk_commit_data)
printf(" commit_metadata");
if (graph->chunk_large_edges)
printf(" large_edges");
printf("\n");
UNLEAK(graph);
return 0;
}
extern int read_replace_refs;
static int graph_write(int argc, const char **argv)
{
struct string_list *pack_indexes = NULL;
struct string_list *commit_hex = NULL;
struct string_list lines;
static struct option builtin_commit_graph_write_options[] = {
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
OPT_BOOL(0, "reachable", &opts.reachable,
N_("start walk at all refs")),
OPT_BOOL(0, "stdin-packs", &opts.stdin_packs,
N_("scan pack-indexes listed by stdin for commits")),
OPT_BOOL(0, "stdin-commits", &opts.stdin_commits,
N_("start walk at commits listed by stdin")),
OPT_BOOL(0, "append", &opts.append,
N_("include all commits already in the commit-graph file")),
OPT_END(),
};
argc = parse_options(argc, argv, NULL,
builtin_commit_graph_write_options,
builtin_commit_graph_write_usage, 0);
if (opts.reachable + opts.stdin_packs + opts.stdin_commits > 1)
die(_("use at most one of --reachable, --stdin-commits, or --stdin-packs"));
if (!opts.obj_dir)
opts.obj_dir = get_object_directory();
read_replace_refs = 0;
if (opts.reachable) {
commit-graph write: add progress output Before this change the "commit-graph write" command didn't report any progress. On my machine this command takes more than 10 seconds to write the graph for linux.git, and around 1m30s on the 2015-04-03-1M-git.git[1] test repository (a test case for a large monorepository). Furthermore, since the gc.writeCommitGraph setting was added in d5d5d7b641 ("gc: automatically write commit-graph files", 2018-06-27), there was no indication at all from a "git gc" run that anything was different. This why one of the progress bars being added here uses start_progress() instead of start_delayed_progress(), so that it's guaranteed to be seen. E.g. on my tiny 867 commit dotfiles.git repository: $ git -c gc.writeCommitGraph=true gc Enumerating objects: 2821, done. [...] Computing commit graph generation numbers: 100% (867/867), done. On larger repositories, such as linux.git the delayed progress bar(s) will kick in, and we'll show what's going on instead of, as was previously happening, printing nothing while we write the graph: $ git -c gc.writeCommitGraph=true gc [...] Annotating commits in commit graph: 1565573, done. Computing commit graph generation numbers: 100% (782484/782484), done. Note that here we don't show "Finding commits for commit graph", this is because under "git gc" we seed the search with the commit references in the repository, and that set is too small to show any progress, but would e.g. on a smaller repo such as git.git with --stdin-commits: $ git rev-list --all | git -c gc.writeCommitGraph=true write --stdin-commits Finding commits for commit graph: 100% (162576/162576), done. Computing commit graph generation numbers: 100% (162576/162576), done. With --stdin-packs we don't show any estimation of how much is left to do. This is because we might be processing more than one pack. We could be less lazy here and show progress, either by detecting that we're only processing one pack, or by first looping over the packs to discover how many commits they have. I don't see the point in doing that work. So instead we get (on 2015-04-03-1M-git.git): $ echo pack-<HASH>.idx | git -c gc.writeCommitGraph=true --exec-path=$PWD commit-graph write --stdin-packs Finding commits for commit graph: 13064614, done. Annotating commits in commit graph: 3001341, done. Computing commit graph generation numbers: 100% (1000447/1000447), done. No GC mode uses --stdin-packs. It's what they use at Microsoft to manually compute the generation numbers for their collection of large packs which are never coalesced. The reason we need a "report_progress" variable passed down from "git gc" is so that we don't report this output when we're running in the process "git gc --auto" detaches from the terminal. Since we write the commit graph from the "git gc" process itself (as opposed to what we do with say the "git repack" phase), we'd end up writing the output to .git/gc.log and reporting it to the user next time as part of the "The last gc run reported the following[...]" error, see 329e6e8794 ("gc: save log from daemonized gc --auto and print it next time", 2015-09-19). So we must keep track of whether or not we're running in that demonized mode, and if so print no progress. See [2] and subsequent replies for a discussion of an approach not taken in compute_generation_numbers(). I.e. we're saying "Computing commit graph generation numbers", even though on an established history we're mostly skipping over all the work we did in the past. This is similar to the white lie we tell in the "Writing objects" phase (not all are objects being written). Always showing progress is considered more important than accuracy. I.e. on a repository like 2015-04-03-1M-git.git we'd hang for 6 seconds with no output on the second "git gc" if no changes were made to any objects in the interim if we'd take the approach in [2]. 1. https://github.com/avar/2015-04-03-1M-git 2. <c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com> (https://public-inbox.org/git/c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com/) Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-09-17 15:33:35 +00:00
write_commit_graph_reachable(opts.obj_dir, opts.append, 1);
return 0;
}
string_list_init(&lines, 0);
if (opts.stdin_packs || opts.stdin_commits) {
struct strbuf buf = STRBUF_INIT;
while (strbuf_getline(&buf, stdin) != EOF)
string_list_append(&lines, strbuf_detach(&buf, NULL));
if (opts.stdin_packs)
pack_indexes = &lines;
if (opts.stdin_commits)
commit_hex = &lines;
UNLEAK(buf);
}
write_commit_graph(opts.obj_dir,
pack_indexes,
commit_hex,
commit-graph write: add progress output Before this change the "commit-graph write" command didn't report any progress. On my machine this command takes more than 10 seconds to write the graph for linux.git, and around 1m30s on the 2015-04-03-1M-git.git[1] test repository (a test case for a large monorepository). Furthermore, since the gc.writeCommitGraph setting was added in d5d5d7b641 ("gc: automatically write commit-graph files", 2018-06-27), there was no indication at all from a "git gc" run that anything was different. This why one of the progress bars being added here uses start_progress() instead of start_delayed_progress(), so that it's guaranteed to be seen. E.g. on my tiny 867 commit dotfiles.git repository: $ git -c gc.writeCommitGraph=true gc Enumerating objects: 2821, done. [...] Computing commit graph generation numbers: 100% (867/867), done. On larger repositories, such as linux.git the delayed progress bar(s) will kick in, and we'll show what's going on instead of, as was previously happening, printing nothing while we write the graph: $ git -c gc.writeCommitGraph=true gc [...] Annotating commits in commit graph: 1565573, done. Computing commit graph generation numbers: 100% (782484/782484), done. Note that here we don't show "Finding commits for commit graph", this is because under "git gc" we seed the search with the commit references in the repository, and that set is too small to show any progress, but would e.g. on a smaller repo such as git.git with --stdin-commits: $ git rev-list --all | git -c gc.writeCommitGraph=true write --stdin-commits Finding commits for commit graph: 100% (162576/162576), done. Computing commit graph generation numbers: 100% (162576/162576), done. With --stdin-packs we don't show any estimation of how much is left to do. This is because we might be processing more than one pack. We could be less lazy here and show progress, either by detecting that we're only processing one pack, or by first looping over the packs to discover how many commits they have. I don't see the point in doing that work. So instead we get (on 2015-04-03-1M-git.git): $ echo pack-<HASH>.idx | git -c gc.writeCommitGraph=true --exec-path=$PWD commit-graph write --stdin-packs Finding commits for commit graph: 13064614, done. Annotating commits in commit graph: 3001341, done. Computing commit graph generation numbers: 100% (1000447/1000447), done. No GC mode uses --stdin-packs. It's what they use at Microsoft to manually compute the generation numbers for their collection of large packs which are never coalesced. The reason we need a "report_progress" variable passed down from "git gc" is so that we don't report this output when we're running in the process "git gc --auto" detaches from the terminal. Since we write the commit graph from the "git gc" process itself (as opposed to what we do with say the "git repack" phase), we'd end up writing the output to .git/gc.log and reporting it to the user next time as part of the "The last gc run reported the following[...]" error, see 329e6e8794 ("gc: save log from daemonized gc --auto and print it next time", 2015-09-19). So we must keep track of whether or not we're running in that demonized mode, and if so print no progress. See [2] and subsequent replies for a discussion of an approach not taken in compute_generation_numbers(). I.e. we're saying "Computing commit graph generation numbers", even though on an established history we're mostly skipping over all the work we did in the past. This is similar to the white lie we tell in the "Writing objects" phase (not all are objects being written). Always showing progress is considered more important than accuracy. I.e. on a repository like 2015-04-03-1M-git.git we'd hang for 6 seconds with no output on the second "git gc" if no changes were made to any objects in the interim if we'd take the approach in [2]. 1. https://github.com/avar/2015-04-03-1M-git 2. <c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com> (https://public-inbox.org/git/c6960252-c095-fb2b-e0bc-b1e6bb261614@gmail.com/) Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-09-17 15:33:35 +00:00
opts.append,
1);
UNLEAK(lines);
return 0;
}
int cmd_commit_graph(int argc, const char **argv, const char *prefix)
{
static struct option builtin_commit_graph_options[] = {
OPT_STRING(0, "object-dir", &opts.obj_dir,
N_("dir"),
N_("The object directory to store the graph")),
OPT_END(),
};
if (argc == 2 && !strcmp(argv[1], "-h"))
usage_with_options(builtin_commit_graph_usage,
builtin_commit_graph_options);
git_config(git_default_config, NULL);
argc = parse_options(argc, argv, prefix,
builtin_commit_graph_options,
builtin_commit_graph_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
if (argc > 0) {
if (!strcmp(argv[0], "read"))
return graph_read(argc, argv);
if (!strcmp(argv[0], "verify"))
return graph_verify(argc, argv);
if (!strcmp(argv[0], "write"))
return graph_write(argc, argv);
}
usage_with_options(builtin_commit_graph_usage,
builtin_commit_graph_options);
}