From be12681896fab9455eb65ea124df423b462e0072 Mon Sep 17 00:00:00 2001
From: Dan McGee
Date: Tue, 18 Oct 2011 00:21:21 -0500
Subject: [PATCH 1/4] pack-objects: mark add_to_write_order() as inline

This function is a whole 26 bytes when compiled on x86_64, but is
currently invoked over 1.037 billion times when running pack-objects
on the Linux kernel git repository. This is hitting the point where
micro-optimizations do make a difference, and inlining it only
increases the object file size by 38 bytes.

As reported by perf, this dropped task-clock from 84183 to 83373 ms,
and total cycles from 223.5 billion to 221.6 billion. Not
astronomical, but worth getting for adding one word.

Signed-off-by: Dan McGee
Signed-off-by: Junio C Hamano
---
 builtin/pack-objects.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index a9c67c18ba..70b757e792 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -454,7 +454,7 @@ static int mark_tagged(const char *path, const unsigned char *sha1, int flag,
 	return 0;
 }

-static void add_to_write_order(struct object_entry **wo,
+static inline void add_to_write_order(struct object_entry **wo,
 			       int *endp,
 			       struct object_entry *e)
 {

From 92bef1a14a6755ce1407a0e180cdc9e14a5c56b9 Mon Sep 17 00:00:00 2001
From: Dan McGee
Date: Tue, 18 Oct 2011 00:21:22 -0500
Subject: [PATCH 2/4] pack-objects: use unsigned int for counter and offset values

This is done in some of the new pack layout code introduced in commit
1b4bb16b9ec331c. This more closely matches the nr_objects global that
is unsigned that these variables are based off of and bounded by.

Signed-off-by: Dan McGee
Signed-off-by: Junio C Hamano
---
 builtin/pack-objects.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 70b757e792..865a7d471a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -455,7 +455,7 @@ static int mark_tagged(const char *path, const unsigned char *sha1, int flag,
 }

 static inline void add_to_write_order(struct object_entry **wo,
-			       int *endp,
+			       unsigned int *endp,
 			       struct object_entry *e)
 {
 	if (e->filled)
@@ -465,7 +465,7 @@ static inline void add_to_write_order(struct object_entry **wo,
 }

 static void add_descendants_to_write_order(struct object_entry **wo,
-					   int *endp,
+					   unsigned int *endp,
 					   struct object_entry *e)
 {
 	struct object_entry *child;
@@ -477,7 +477,7 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 }

 static void add_family_to_write_order(struct object_entry **wo,
-				      int *endp,
+				      unsigned int *endp,
 				      struct object_entry *e)
 {
 	struct object_entry *root;
@@ -490,7 +490,7 @@

 static struct object_entry **compute_write_order(void)
 {
-	int i, wo_end;
+	unsigned int i, wo_end;

 	struct object_entry **wo = xmalloc(nr_objects * sizeof(*wo));

@@ -506,8 +506,8 @@ static struct object_entry **compute_write_order(void)
 	 * Make sure delta_sibling is sorted in the original
 	 * recency order.
 	 */
-	for (i = nr_objects - 1; 0 <= i; i--) {
-		struct object_entry *e = &objects[i];
+	for (i = nr_objects; i > 0;) {
+		struct object_entry *e = &objects[--i];
 		if (!e->delta)
 			continue;
 		/* Mark me as the first child */

From f380872f0abc7fe98022696996d346df99c53f1a Mon Sep 17 00:00:00 2001
From: Dan McGee
Date: Tue, 18 Oct 2011 00:21:24 -0500
Subject: [PATCH 3/4] pack-objects: rewrite add_descendants_to_write_order() iteratively

This removes the need to call this function recursively, shrinking the
code size slightly and netting a small performance increase.

Signed-off-by: Dan McGee
Signed-off-by: Junio C Hamano
---
 builtin/pack-objects.c | 44 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 865a7d471a..5b544bf444 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -468,12 +468,43 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 					   unsigned int *endp,
 					   struct object_entry *e)
 {
-	struct object_entry *child;
-
-	for (child = e->delta_child; child; child = child->delta_sibling)
-		add_to_write_order(wo, endp, child);
-	for (child = e->delta_child; child; child = child->delta_sibling)
-		add_descendants_to_write_order(wo, endp, child);
+	int add_to_order = 1;
+	while (e) {
+		if (add_to_order) {
+			struct object_entry *s;
+			/* add this node... */
+			add_to_write_order(wo, endp, e);
+			/* all its siblings... */
+			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+				add_to_write_order(wo, endp, s);
+			}
+		}
+		/* drop down a level to add left subtree nodes if possible */
+		if (e->delta_child) {
+			add_to_order = 1;
+			e = e->delta_child;
+		} else {
+			add_to_order = 0;
+			/* our sibling might have some children, it is next */
+			if (e->delta_sibling) {
+				e = e->delta_sibling;
+				continue;
+			}
+			/* go back to our parent node */
+			e = e->delta;
+			while (e && !e->delta_sibling) {
+				/* we're on the right side of a subtree, keep
+				 * going up until we can go right again */
+				e = e->delta;
+			}
+			if (!e) {
+				/* done- we hit our original root node */
+				return;
+			}
+			/* pass it off to sibling at this level */
+			e = e->delta_sibling;
+		}
+	};
 }

 static void add_family_to_write_order(struct object_entry **wo,
@@ -484,7 +515,6 @@

 	for (root = e; root->delta; root = root->delta)
 		; /* nothing */
-	add_to_write_order(wo, endp, root);
 	add_descendants_to_write_order(wo, endp, root);
 }

From 38d4debb6d180ca53fcb12b8115e81fd4c5262d0 Mon Sep 17 00:00:00 2001
From: Dan McGee
Date: Tue, 18 Oct 2011 00:21:23 -0500
Subject: [PATCH 4/4] pack-objects: don't traverse objects unnecessarily

This brings back some of the performance lost in optimizing recency
order inside pack objects. We were doing extreme amounts of object
re-traversal: for the 2.14 million objects in the Linux kernel
repository, we were calling add_to_write_order() over 1.03 billion
times (a 0.2% hit rate, making 99.8% of these calls extraneous).

Two optimizations take place here- we can start our objects array
iteration from a known point where we left off before we started
trying to find our tags, and we don't need to do the deep dives
required by add_family_to_write_order() if the object has already
been marked as filled.

These two optimizations bring some pretty spectacular results via
`perf stat`:

    task-clock:   83373 ms        -->  43800 ms           (50% faster)
    cycles:       221,633,461,676 -->  116,307,209,986    (47% fewer)
    instructions: 149,299,179,939 -->  122,998,800,184    (18% fewer)

Helped-by: Ramsay Jones (format string fix in "die" message)
Signed-off-by: Dan McGee
Signed-off-by: Junio C Hamano
---
 builtin/pack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5b544bf444..80ab6c39f9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -520,7 +520,7 @@ static void add_family_to_write_order(struct object_entry **wo,

 static struct object_entry **compute_write_order(void)
 {
-	unsigned int i, wo_end;
+	unsigned int i, wo_end, last_untagged;

 	struct object_entry **wo = xmalloc(nr_objects * sizeof(*wo));

@@ -551,7 +551,7 @@ static struct object_entry **compute_write_order(void)
 	for_each_tag_ref(mark_tagged, NULL);

 	/*
-	 * Give the commits in the original recency order until
+	 * Give the objects in the original recency order until
 	 * we see a tagged tip.
 	 */
 	for (i = wo_end = 0; i < nr_objects; i++) {
@@ -559,6 +559,7 @@ static struct object_entry **compute_write_order(void)
 			break;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
+	last_untagged = i;

 	/*
 	 * Then fill all the tagged tips.
@@ -571,7 +572,7 @@ static struct object_entry **compute_write_order(void)
 	/*
 	 * And then all remaining commits and tags.
 	 */
-	for (i = 0; i < nr_objects; i++) {
+	for (i = last_untagged; i < nr_objects; i++) {
 		if (objects[i].type != OBJ_COMMIT &&
 		    objects[i].type != OBJ_TAG)
 			continue;
@@ -581,7 +582,7 @@ static struct object_entry **compute_write_order(void)
 	/*
 	 * And then all the trees.
 	 */
-	for (i = 0; i < nr_objects; i++) {
+	for (i = last_untagged; i < nr_objects; i++) {
 		if (objects[i].type != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
@@ -590,8 +591,13 @@ static struct object_entry **compute_write_order(void)
 	/*
 	 * Finally all the rest in really tight order
 	 */
-	for (i = 0; i < nr_objects; i++)
-		add_family_to_write_order(wo, &wo_end, &objects[i]);
+	for (i = last_untagged; i < nr_objects; i++) {
+		if (!objects[i].filled)
+			add_family_to_write_order(wo, &wo_end, &objects[i]);
+	}
+
+	if (wo_end != nr_objects)
+		die("ordered %u objects, expected %"PRIu32, wo_end, nr_objects);

 	return wo;
 }
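
The iterative traversal introduced in patch 3 is easier to follow outside of a
diff. Below is a minimal standalone sketch of the same walk pattern, not the
patch itself: struct tree_node, visit(), walk_descendants() and main() are
hypothetical stand-ins, with parent/child/sibling pointers playing the roles
of the delta, delta_child and delta_sibling fields of struct object_entry in
builtin/pack-objects.c.

/*
 * Illustrative only -- a simplified model of the iterative tree walk from
 * patch 3; names here are made up and do not appear in pack-objects.
 */
#include <stdio.h>

struct tree_node {
	int id;
	struct tree_node *parent;   /* plays the role of e->delta */
	struct tree_node *child;    /* plays the role of e->delta_child */
	struct tree_node *sibling;  /* plays the role of e->delta_sibling */
};

static void visit(struct tree_node *n)
{
	printf("visited node %d\n", n->id);
}

/*
 * Iterative pre-order walk: visit a node together with all of its siblings,
 * descend into children when possible, otherwise climb back up through the
 * parent pointers until a sibling offers a new subtree to descend into.
 */
static void walk_descendants(struct tree_node *e)
{
	int add_to_order = 1;

	while (e) {
		if (add_to_order) {
			struct tree_node *s;
			visit(e);                 /* this node... */
			for (s = e->sibling; s; s = s->sibling)
				visit(s);         /* ...and all of its siblings */
		}
		if (e->child) {
			add_to_order = 1;         /* new level: emit it on arrival */
			e = e->child;
		} else {
			add_to_order = 0;         /* this level was already emitted */
			if (e->sibling) {
				e = e->sibling;   /* a sibling may still have children */
				continue;
			}
			/* climb until we can move sideways again */
			e = e->parent;
			while (e && !e->sibling)
				e = e->parent;
			if (!e)
				return;           /* climbed past the root: done */
			e = e->sibling;
		}
	}
}

int main(void)
{
	/* tiny sample tree: root has children a and b; a has child c */
	struct tree_node root = { 1, NULL, NULL, NULL };
	struct tree_node a = { 2, &root, NULL, NULL };
	struct tree_node b = { 3, &root, NULL, NULL };
	struct tree_node c = { 4, &a, NULL, NULL };

	root.child = &a;
	a.sibling = &b;
	a.child = &c;

	walk_descendants(&root);  /* prints nodes 1, 2, 3 and then 4 */
	return 0;
}

The add_to_order flag is what avoids re-visiting work: a whole sibling level
is emitted in one batch when it is first reached from above, so a later
sideways move through that level only needs to look for children to descend
into, never to emit nodes again.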