git/list-objects.c
Jeff King 200abe7458 list-objects: only look at cmdline trees with edge_hint
When rev-list is given a command-line like:

  git rev-list --objects $commit --not --all

the most accurate answer is the difference between the set
of objects reachable from $commit and the set reachable from
all of the existing refs. However, we have not historically
provided that answer, because it is very expensive to
calculate. We would have to open every tree of every commit
in the entire history.

Instead, we find the accurate set difference of the
reachable commits, and then mark the trees at the boundaries
as uninteresting. This misses objects which appear in the
trees of both the interesting commits and deep within the
uninteresting history.

Commit fbd4a70 (list-objects: mark more commits as edges in
mark_edges_uninteresting, 2013-08-16) noticed that we miss
those objects during pack-objects, and added code to examine
the trees of all of the "--not" refs given on the
command-line.  Note that this is still not the complete set
difference, because we look only at the tips of the
command-line arguments, not all of their reachable commits.
But it increases the set of boundary objects we consider,
which is especially important for shallow fetches.  So we
are trading extra CPU time for a larger set of boundary
objects, which can improve the resulting pack size for a
--thin pack.

This tradeoff probably makes sense in the context of
pack-objects, where we have set revs->edge_hint to have the
traversal feed us the set of boundary objects.  For a
regular rev-list, though, it is probably not a good
tradeoff. It is true that it makes our list slightly closer
to a true set difference, but it is a rare case where this
is important. And because we do not have revs->edge_hint
set, we do nothing useful with the larger set of boundary
objects.

This patch therefore ties the extra tree examination to the
revs->edge_hint flag; it is the presence of that flag that
makes the tradeoff worthwhile.

Here is output from the p0001-rev-list showing the
improvement in performance:

Test                                             HEAD^             HEAD
-----------------------------------------------------------------------------------------
0001.1: rev-list --all                           0.69(0.65+0.02)   0.69(0.66+0.02) +0.0%
0001.2: rev-list --all --objects                 3.22(3.19+0.03)   3.23(3.20+0.03) +0.3%
0001.4: rev-list $commit --not --all             0.04(0.04+0.00)   0.04(0.04+0.00) +0.0%
0001.5: rev-list --objects $commit --not --all   0.27(0.26+0.01)   0.04(0.04+0.00) -85.2%

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-01-21 14:46:24 -08:00

237 lines
6 KiB
C

#include "cache.h"
#include "tag.h"
#include "commit.h"
#include "tree.h"
#include "blob.h"
#include "diff.h"
#include "tree-walk.h"
#include "revision.h"
#include "list-objects.h"
static void process_blob(struct rev_info *revs,
struct blob *blob,
show_object_fn show,
struct name_path *path,
const char *name,
void *cb_data)
{
struct object *obj = &blob->object;
if (!revs->blob_objects)
return;
if (!obj)
die("bad blob object");
if (obj->flags & (UNINTERESTING | SEEN))
return;
obj->flags |= SEEN;
show(obj, path, name, cb_data);
}
/*
* Processing a gitlink entry currently does nothing, since
* we do not recurse into the subproject.
*
* We *could* eventually add a flag that actually does that,
* which would involve:
* - is the subproject actually checked out?
* - if so, see if the subproject has already been added
* to the alternates list, and add it if not.
* - process the commit (or tag) the gitlink points to
* recursively.
*
* However, it's unclear whether there is really ever any
* reason to see superprojects and subprojects as such a
* "unified" object pool (potentially resulting in a totally
* humongous pack - avoiding which was the whole point of
* having gitlinks in the first place!).
*
* So for now, there is just a note that we *could* follow
* the link, and how to do it. Whether it necessarily makes
* any sense what-so-ever to ever do that is another issue.
*/
static void process_gitlink(struct rev_info *revs,
const unsigned char *sha1,
show_object_fn show,
struct name_path *path,
const char *name,
void *cb_data)
{
/* Nothing to do */
}
static void process_tree(struct rev_info *revs,
struct tree *tree,
show_object_fn show,
struct name_path *path,
struct strbuf *base,
const char *name,
void *cb_data)
{
struct object *obj = &tree->object;
struct tree_desc desc;
struct name_entry entry;
struct name_path me;
enum interesting match = revs->diffopt.pathspec.nr == 0 ?
all_entries_interesting: entry_not_interesting;
int baselen = base->len;
if (!revs->tree_objects)
return;
if (!obj)
die("bad tree object");
if (obj->flags & (UNINTERESTING | SEEN))
return;
if (parse_tree(tree) < 0)
die("bad tree object %s", sha1_to_hex(obj->sha1));
obj->flags |= SEEN;
show(obj, path, name, cb_data);
me.up = path;
me.elem = name;
me.elem_len = strlen(name);
if (!match) {
strbuf_addstr(base, name);
if (base->len)
strbuf_addch(base, '/');
}
init_tree_desc(&desc, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
if (match != all_entries_interesting) {
match = tree_entry_interesting(&entry, base, 0,
&revs->diffopt.pathspec);
if (match == all_entries_not_interesting)
break;
if (match == entry_not_interesting)
continue;
}
if (S_ISDIR(entry.mode))
process_tree(revs,
lookup_tree(entry.sha1),
show, &me, base, entry.path,
cb_data);
else if (S_ISGITLINK(entry.mode))
process_gitlink(revs, entry.sha1,
show, &me, entry.path,
cb_data);
else
process_blob(revs,
lookup_blob(entry.sha1),
show, &me, entry.path,
cb_data);
}
strbuf_setlen(base, baselen);
free(tree->buffer);
tree->buffer = NULL;
}
static void mark_edge_parents_uninteresting(struct commit *commit,
struct rev_info *revs,
show_edge_fn show_edge)
{
struct commit_list *parents;
for (parents = commit->parents; parents; parents = parents->next) {
struct commit *parent = parents->item;
if (!(parent->object.flags & UNINTERESTING))
continue;
mark_tree_uninteresting(parent->tree);
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
parent->object.flags |= SHOWN;
show_edge(parent);
}
}
}
void mark_edges_uninteresting(struct rev_info *revs, show_edge_fn show_edge)
{
struct commit_list *list;
int i;
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
if (commit->object.flags & UNINTERESTING) {
mark_tree_uninteresting(commit->tree);
if (revs->edge_hint && !(commit->object.flags & SHOWN)) {
commit->object.flags |= SHOWN;
show_edge(commit);
}
continue;
}
mark_edge_parents_uninteresting(commit, revs, show_edge);
}
if (revs->edge_hint) {
for (i = 0; i < revs->cmdline.nr; i++) {
struct object *obj = revs->cmdline.rev[i].item;
struct commit *commit = (struct commit *)obj;
if (obj->type != OBJ_COMMIT || !(obj->flags & UNINTERESTING))
continue;
mark_tree_uninteresting(commit->tree);
if (!(obj->flags & SHOWN)) {
obj->flags |= SHOWN;
show_edge(commit);
}
}
}
}
static void add_pending_tree(struct rev_info *revs, struct tree *tree)
{
add_pending_object(revs, &tree->object, "");
}
void traverse_commit_list(struct rev_info *revs,
show_commit_fn show_commit,
show_object_fn show_object,
void *data)
{
int i;
struct commit *commit;
struct strbuf base;
strbuf_init(&base, PATH_MAX);
while ((commit = get_revision(revs)) != NULL) {
/*
* an uninteresting boundary commit may not have its tree
* parsed yet, but we are not going to show them anyway
*/
if (commit->tree)
add_pending_tree(revs, commit->tree);
show_commit(commit, data);
}
for (i = 0; i < revs->pending.nr; i++) {
struct object_array_entry *pending = revs->pending.objects + i;
struct object *obj = pending->item;
const char *name = pending->name;
if (obj->flags & (UNINTERESTING | SEEN))
continue;
if (obj->type == OBJ_TAG) {
obj->flags |= SEEN;
show_object(obj, NULL, name, data);
continue;
}
if (obj->type == OBJ_TREE) {
process_tree(revs, (struct tree *)obj, show_object,
NULL, &base, name, data);
continue;
}
if (obj->type == OBJ_BLOB) {
process_blob(revs, (struct blob *)obj, show_object,
NULL, name, data);
continue;
}
die("unknown pending object %s (%s)",
sha1_to_hex(obj->sha1), name);
}
if (revs->pending.nr) {
free(revs->pending.objects);
revs->pending.nr = 0;
revs->pending.alloc = 0;
revs->pending.objects = NULL;
}
strbuf_release(&base);
}