From 26a66a6b1c653bc6c05534016992985d48267d70 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:40 +0000 Subject: [PATCH 1/9] diffcore-rename: rename num_create to num_destinations Our main data structures are rename_src and rename_dst. For counters of these data structures, num_sources and num_destinations seem natural; definitely more so than using num_create for the latter. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index d367a6d244..15a98f566e 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -434,7 +434,7 @@ static void record_if_better(struct diff_score m[], struct diff_score *o) * 1 if we need to disable inexact rename detection; * 2 if we would be under the limit if we were given -C instead of -C -C. */ -static int too_many_rename_candidates(int num_create, +static int too_many_rename_candidates(int num_destinations, struct diff_options *options) { int rename_limit = options->rename_limit; @@ -447,17 +447,17 @@ static int too_many_rename_candidates(int num_create, * This basically does a test for the rename matrix not * growing larger than a "rename_limit" square matrix, ie: * - * num_create * num_src > rename_limit * rename_limit + * num_destinations * num_src > rename_limit * rename_limit */ if (rename_limit <= 0) rename_limit = 32767; - if ((num_create <= rename_limit || num_src <= rename_limit) && - ((uint64_t)num_create * (uint64_t)num_src + if ((num_destinations <= rename_limit || num_src <= rename_limit) && + ((uint64_t)num_destinations * (uint64_t)num_src <= (uint64_t)rename_limit * (uint64_t)rename_limit)) return 0; options->needed_rename_limit = - num_src > num_create ? num_src : num_create; + num_src > num_destinations ? num_src : num_destinations; /* Are we running under -C -C? */ if (!options->flags.find_copies_harder) @@ -469,8 +469,8 @@ static int too_many_rename_candidates(int num_create, continue; num_src++; } - if ((num_create <= rename_limit || num_src <= rename_limit) && - ((uint64_t)num_create * (uint64_t)num_src + if ((num_destinations <= rename_limit || num_src <= rename_limit) && + ((uint64_t)num_destinations * (uint64_t)num_src <= (uint64_t)rename_limit * (uint64_t)rename_limit)) return 2; return 1; @@ -505,7 +505,7 @@ void diffcore_rename(struct diff_options *options) struct diff_queue_struct outq; struct diff_score *mx; int i, j, rename_count, skip_unmodified = 0; - int num_create, dst_cnt; + int num_destinations, dst_cnt; struct progress *progress = NULL; if (!minimum_score) @@ -570,13 +570,13 @@ void diffcore_rename(struct diff_options *options) * Calculate how many renames are left (but all the source * files still remain as options for rename/copies!) */ - num_create = (rename_dst_nr - rename_count); + num_destinations = (rename_dst_nr - rename_count); /* All done? */ - if (!num_create) + if (!num_destinations) goto cleanup; - switch (too_many_rename_candidates(num_create, options)) { + switch (too_many_rename_candidates(num_destinations, options)) { case 1: goto cleanup; case 2: @@ -593,7 +593,8 @@ void diffcore_rename(struct diff_options *options) (uint64_t)rename_dst_nr * (uint64_t)rename_src_nr); } - mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_create), sizeof(*mx)); + mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations), + sizeof(*mx)); for (dst_cnt = i = 0; i < rename_dst_nr; i++) { struct diff_filespec *two = rename_dst[i].two; struct diff_score *m; From 00b8cccdd83c6f8c9ffefd133b291dadf8e788d7 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:41 +0000 Subject: [PATCH 2/9] diffcore-rename: avoid usage of global in too_many_rename_candidates() too_many_rename_candidates() got the number of rename destinations via an argument to the function, but the number of rename sources via a global variable. That felt rather inconsistent. Pass in the number of rename sources as an argument as well. While we are at it... We had a local variable, num_src, that served two purposes. Initially it was set to the global value, but later was used for counting a subset of the number of sources. Since we now have a function argument for the former usage, introduce a clearer variable name for the latter usage. This patch has no behavioral changes; it's just renaming and passing an argument instead of grabbing it from the global namespace. (You may find it easier to view the patch using git diff's --color-words option.) Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 15a98f566e..1d6675c040 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -434,12 +434,11 @@ static void record_if_better(struct diff_score m[], struct diff_score *o) * 1 if we need to disable inexact rename detection; * 2 if we would be under the limit if we were given -C instead of -C -C. */ -static int too_many_rename_candidates(int num_destinations, +static int too_many_rename_candidates(int num_destinations, int num_sources, struct diff_options *options) { int rename_limit = options->rename_limit; - int num_src = rename_src_nr; - int i; + int i, limited_sources; options->needed_rename_limit = 0; @@ -447,30 +446,30 @@ static int too_many_rename_candidates(int num_destinations, * This basically does a test for the rename matrix not * growing larger than a "rename_limit" square matrix, ie: * - * num_destinations * num_src > rename_limit * rename_limit + * num_destinations * num_sources > rename_limit * rename_limit */ if (rename_limit <= 0) rename_limit = 32767; - if ((num_destinations <= rename_limit || num_src <= rename_limit) && - ((uint64_t)num_destinations * (uint64_t)num_src + if ((num_destinations <= rename_limit || num_sources <= rename_limit) && + ((uint64_t)num_destinations * (uint64_t)num_sources <= (uint64_t)rename_limit * (uint64_t)rename_limit)) return 0; options->needed_rename_limit = - num_src > num_destinations ? num_src : num_destinations; + num_sources > num_destinations ? num_sources : num_destinations; /* Are we running under -C -C? */ if (!options->flags.find_copies_harder) return 1; /* Would we bust the limit if we were running under -C? */ - for (num_src = i = 0; i < rename_src_nr; i++) { + for (limited_sources = i = 0; i < num_sources; i++) { if (diff_unmodified_pair(rename_src[i].p)) continue; - num_src++; + limited_sources++; } - if ((num_destinations <= rename_limit || num_src <= rename_limit) && - ((uint64_t)num_destinations * (uint64_t)num_src + if ((num_destinations <= rename_limit || limited_sources <= rename_limit) && + ((uint64_t)num_destinations * (uint64_t)limited_sources <= (uint64_t)rename_limit * (uint64_t)rename_limit)) return 2; return 1; @@ -576,7 +575,8 @@ void diffcore_rename(struct diff_options *options) if (!num_destinations) goto cleanup; - switch (too_many_rename_candidates(num_destinations, options)) { + switch (too_many_rename_candidates(num_destinations, rename_src_nr, + options)) { case 1: goto cleanup; case 2: From ad8a1be529ea9e0bd6346a3cb1d53fec16e3bd10 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:42 +0000 Subject: [PATCH 3/9] diffcore-rename: simplify limit check diffcore-rename had two different checks of the form if ((a < limit || b < limit) && a * b <= limit * limit) This can be simplified to if (st_mult(a, b) <= st_mult(limit, limit)) which makes it clearer how we are checking for overflow, and makes it much easier to parse given the drop from 8 to 4 variable appearances. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 1d6675c040..16553ab259 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -447,12 +447,16 @@ static int too_many_rename_candidates(int num_destinations, int num_sources, * growing larger than a "rename_limit" square matrix, ie: * * num_destinations * num_sources > rename_limit * rename_limit + * + * We use st_mult() to check overflow conditions; in the + * exceptional circumstance that size_t isn't large enough to hold + * the multiplication, the system won't be able to allocate enough + * memory for the matrix anyway. */ if (rename_limit <= 0) rename_limit = 32767; - if ((num_destinations <= rename_limit || num_sources <= rename_limit) && - ((uint64_t)num_destinations * (uint64_t)num_sources - <= (uint64_t)rename_limit * (uint64_t)rename_limit)) + if (st_mult(num_destinations, num_sources) + <= st_mult(rename_limit, rename_limit)) return 0; options->needed_rename_limit = @@ -468,9 +472,8 @@ static int too_many_rename_candidates(int num_destinations, int num_sources, continue; limited_sources++; } - if ((num_destinations <= rename_limit || limited_sources <= rename_limit) && - ((uint64_t)num_destinations * (uint64_t)limited_sources - <= (uint64_t)rename_limit * (uint64_t)rename_limit)) + if (st_mult(num_destinations, limited_sources) + <= st_mult(rename_limit, rename_limit)) return 2; return 1; } From 81c4bf02964e51d0cde79304794d51da86d23b09 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:43 +0000 Subject: [PATCH 4/9] diffcore-rename: reduce jumpiness in progress counters Inexact rename detection works by comparing all sources to all destinations, computing similarities, and then finding the best matches among those that are sufficiently similar. However, it is preceded by exact rename detection that works by checking if there are files with identical hashes. If exact renames are found, we can exclude some files from inexact rename detection. The inexact rename detection loops over the full set of files, but immediately skips those for which rename_dst[i].is_rename is true and thus doesn't compare any sources to that destination. As such, these paths shouldn't be included in the progress counter. For the eagle eyed, this change hints at an actual optimization -- the first one I presented at Git Merge 2020. I'll be submitting that optimization later, once the basic merge-ort algorithm has merged. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 16553ab259..55a188abcc 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -593,7 +593,7 @@ void diffcore_rename(struct diff_options *options) if (options->show_rename_progress) { progress = start_delayed_progress( _("Performing inexact rename detection"), - (uint64_t)rename_dst_nr * (uint64_t)rename_src_nr); + (uint64_t)num_destinations * (uint64_t)rename_src_nr); } mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations), @@ -633,7 +633,8 @@ void diffcore_rename(struct diff_options *options) diff_free_filespec_blob(two); } dst_cnt++; - display_progress(progress, (uint64_t)(i+1)*(uint64_t)rename_src_nr); + display_progress(progress, + (uint64_t)dst_cnt * (uint64_t)rename_src_nr); } stop_progress(&progress); From 5c72261c664e330e4fec7b9374896ba4fd75ad9f Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:44 +0000 Subject: [PATCH 5/9] t4058: add more tests and documentation for duplicate tree entry handling Commit 4d6be03b95 ("diffcore-rename: avoid processing duplicate destinations", 2015-02-26) added t4058 to demonstrate that a workaround it added to avoid double frees (namely to just turn off rename detection when trees had duplicate entries) would indeed avoid segfaults. The tests, though, give the impression that the expected diffs are "correct" when in reality they are just "don't segfault, and do something semi-reasonable under the circumstances". Add some notes to make this clearer. Also, commit 25d5ea410f ("[PATCH] Redo rename/copy detection logic.", 2005-05-24) added a similar workaround to avoid segfaults, but for rename_src rather than rename_dst. I do not see any tests in the testsuite to cover the collision detection of entries limited to the source side, so add a couple. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- t/t4058-diff-duplicates.sh | 47 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/t/t4058-diff-duplicates.sh b/t/t4058-diff-duplicates.sh index c24ee175ef..bd68508956 100755 --- a/t/t4058-diff-duplicates.sh +++ b/t/t4058-diff-duplicates.sh @@ -1,5 +1,14 @@ #!/bin/sh +# NOTICE: +# This testsuite does a number of diffs and checks that the output match. +# However, it is a "garbage in, garbage out" situation; the trees have +# duplicate entries for individual paths, and it results in diffs that do +# not make much sense. As such, it is not clear that the diffs are +# "correct". The primary purpose of these tests was to verify that +# diff-tree does not segfault, but there is perhaps some value in ensuring +# that the diff output isn't wildly unreasonable. + test_description='test tree diff when trees have duplicate entries' . ./test-lib.sh @@ -57,7 +66,16 @@ test_expect_success 'create trees with duplicate entries' ' git tag two $outer_two ' -test_expect_success 'diff-tree between trees' ' +test_expect_success 'create tree without duplicate entries' ' + blob_one=$(echo one | git hash-object -w --stdin) && + outer_three=$(make_tree \ + 100644 renamed $blob_one + ) && + git tag three $outer_three +' + +test_expect_success 'diff-tree between duplicate trees' ' + # See NOTICE at top of file { printf ":000000 100644 $ZERO_OID $blob_two A\touter/inner\n" && printf ":000000 100644 $ZERO_OID $blob_two A\touter/inner\n" && @@ -71,9 +89,34 @@ test_expect_success 'diff-tree between trees' ' ' test_expect_success 'diff-tree with renames' ' - # same expectation as above, since we disable rename detection + # See NOTICE at top of file. git diff-tree -M -r --no-abbrev one two >actual && test_cmp expect actual ' +test_expect_success 'diff-tree FROM duplicate tree' ' + # See NOTICE at top of file. + { + printf ":100644 000000 $blob_one $ZERO_OID D\touter/inner\n" && + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":000000 100644 $ZERO_OID $blob_one A\trenamed\n" + } >expect && + git diff-tree -r --no-abbrev one three >actual && + test_cmp expect actual +' + +test_expect_success 'diff-tree FROM duplicate tree, with renames' ' + # See NOTICE at top of file. + { + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":100644 000000 $blob_two $ZERO_OID D\touter/inner\n" && + printf ":100644 100644 $blob_one $blob_one R100\touter/inner\trenamed\n" + } >expect && + git diff-tree -M -r --no-abbrev one three >actual && + test_cmp expect actual +' + test_done From ac14de13b228285b798ed805812fe20d1bc55eb2 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:45 +0000 Subject: [PATCH 6/9] t4058: explore duplicate tree entry handling in a bit more detail While creating the last commit, I found a number of other cases where git would segfault when faced with trees that have duplicate entries. None of these segfaults are in the diffcore-rename code (they all occur in cache-tree and unpack-trees). Further, to my knowledge, no one has ever been adversely affected by these bugs, and given that it has been 15 years and folks have fixed a few other issues with historical duplicate entries (as noted in the last commit), I am not sure we will ever run into anyone having problems with these. So I am not sure these are worth fixing, but it doesn't hurt to at least document these failures in the same test file that is concerned with duplicate tree entries. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- t/t4058-diff-duplicates.sh | 67 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/t/t4058-diff-duplicates.sh b/t/t4058-diff-duplicates.sh index bd68508956..ad3f37d388 100755 --- a/t/t4058-diff-duplicates.sh +++ b/t/t4058-diff-duplicates.sh @@ -119,4 +119,71 @@ test_expect_success 'diff-tree FROM duplicate tree, with renames' ' test_cmp expect actual ' +test_expect_success 'create a few commits' ' + git commit-tree -m "Duplicate Entries" two^{tree} >commit_id && + git branch base $(cat commit_id) && + + git commit-tree -p $(cat commit_id) -m "Just one" three^{tree} >up && + git branch update $(cat up) && + + git commit-tree -p $(cat up) -m "Back to weird" two^{tree} >final && + git branch final $(cat final) && + + rm commit_id up final +' + +test_expect_failure 'git read-tree does not segfault' ' + test_when_finished rm .git/index.lock && + test_might_fail git read-tree --reset base +' + +test_expect_failure 'reset --hard does not segfault' ' + test_when_finished rm .git/index.lock && + git checkout base && + test_might_fail git reset --hard +' + +test_expect_failure 'git diff HEAD does not segfault' ' + git checkout base && + GIT_TEST_CHECK_CACHE_TREE=false && + git reset --hard && + test_might_fail git diff HEAD +' + +test_expect_failure 'can switch to another branch when status is empty' ' + git clean -ffdqx && + git status --porcelain -uno >actual && + test_must_be_empty actual && + git checkout update +' + +test_expect_success 'forcibly switch to another branch, verify status empty' ' + git checkout -f update && + git status --porcelain -uno >actual && + test_must_be_empty actual +' + +test_expect_success 'fast-forward from non-duplicate entries to duplicate' ' + git merge final +' + +test_expect_failure 'clean status, switch branches, status still clean' ' + git status --porcelain -uno >actual && + test_must_be_empty actual && + git checkout base && + git status --porcelain -uno >actual && + test_must_be_empty actual +' + +test_expect_success 'switch to base branch and force status to be clean' ' + git checkout base && + GIT_TEST_CHECK_CACHE_TREE=false git reset --hard && + git status --porcelain -uno >actual && + test_must_be_empty actual +' + +test_expect_failure 'fast-forward from duplicate entries to non-duplicate' ' + git merge update +' + test_done From b970b4ef62432980bf106c2f0970c25ae0de5cce Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:46 +0000 Subject: [PATCH 7/9] diffcore-rename: simplify and accelerate register_rename_src() register_rename_src() took pains to create an array in rename_src which was sorted by pathname of the contained diff_filepair. The sorting was entirely unnecessary since callers pass filepairs to us in sorted order. We can simply append to the end of the rename_src array, speeding up diffcore_rename() setup time. Also, note that I dropped the return type on the function since it was unconditionally discarded anyway. This patch is being submitted in a different order than its original development, but in a large rebase of many commits with lots of renames and with several optimizations to inexact rename detection, diffcore_rename() setup time was a sizeable chunk of overall runtime. This patch dropped execution time of rebasing 35 commits with lots of renames by 2% overall. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 55a188abcc..a215421a9c 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -76,36 +76,23 @@ static struct diff_rename_src { } *rename_src; static int rename_src_nr, rename_src_alloc; -static struct diff_rename_src *register_rename_src(struct diff_filepair *p) +static void register_rename_src(struct diff_filepair *p) { - int first, last; - struct diff_filespec *one = p->one; - unsigned short score = p->score; + /* + * If we have multiple entries at the same path in the source tree + * (an invalid tree, to be sure), avoid using more more than one + * such entry in rename detection. Once upon a time, doing so + * caused segfaults; see commit 25d5ea410f ("[PATCH] Redo + * rename/copy detection logic.", 2005-05-24). + */ + if (rename_src_nr > 0 && + !strcmp(rename_src[rename_src_nr-1].p->one->path, p->one->path)) + return; - first = 0; - last = rename_src_nr; - while (last > first) { - int next = first + ((last - first) >> 1); - struct diff_rename_src *src = &(rename_src[next]); - int cmp = strcmp(one->path, src->p->one->path); - if (!cmp) - return src; - if (cmp < 0) { - last = next; - continue; - } - first = next+1; - } - - /* insert to make it at "first" */ ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc); + rename_src[rename_src_nr].p = p; + rename_src[rename_src_nr].score = p->score; rename_src_nr++; - if (first < rename_src_nr) - MOVE_ARRAY(rename_src + first + 1, rename_src + first, - rename_src_nr - first - 1); - rename_src[first].p = p; - rename_src[first].score = score; - return &(rename_src[first]); } static int basename_same(struct diff_filespec *src, struct diff_filespec *dst) From 9db2ac56168e58f856b001a93ad369affe18879f Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 11 Dec 2020 09:08:47 +0000 Subject: [PATCH 8/9] diffcore-rename: accelerate rename_dst setup register_rename_src() simply references the passed pair inside rename_src. In contrast, add_rename_dst() did something entirely different for rename_dst. Instead of copying the passed pair, it made a copy of the second diff_filespec from the passed pair, referenced it, and then set the diff_rename_dst.pair field to NULL. Later, when a pairing is found, record_rename_pair() allocated a full diff_filepair via diff_queue() and pointed its src and dst fields at the appropriate diff_filespecs. This contrast between register_rename_src() for the rename_src data structure and add_rename_dst() for the rename_dst data structure is oddly inconsistent and requires more memory and work than necessary. Let's just reference the original diff_filepair in rename_dst as-is, just as we do with rename_src. Add a new rename_dst.is_rename field, since the rename_dst.p field is never NULL unlike the old rename_dst.pair field. Taking advantage of this change and the fact that same-named paths will be adjacent, we can get rid of the sorting of the array and most of the lookups on it, allowing us to instead just append as we go. However, there is one remaining reason to still keep locate_rename_dst(): handling broken pairs (i.e. when break detection is on). Those are somewhat rare, but we can set up a simple strintmap to get the map between the source and the index. Doing that allows us to still have a fast lookup without sorting the rename_dst array. Since the sorting had been done in a weakly quadratic manner, when many renames are involved this time could add up. There is still a strcmp() in add_rename_dst() that I have left in place to make it easier to verify that the algorithm has the same results. This strcmp() is there to check for duplicate destination entries (which was the easiest way at the time to avoid segfaults in the diffcore-rename code when trees had multiple entries at a given path). The underlying double free()s are no longer an issue with the new algorithm, but that can be addressed in a subsequent commit. This patch is being submitted in a different order than its original development, but in a large rebase of many commits with lots of renames and with several optimizations to inexact rename detection, both setup time and write back to output queue time from diffcore_rename() were sizeable chunks of overall runtime. This patch accelerated the setup time by about 65%, and final write back to the output queue time by about 50%, resulting in an overall drop of 3.5% on the execution time of rebasing a few dozen patches. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 148 ++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 83 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index a215421a9c..2a8fcd5292 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -9,63 +9,48 @@ #include "hashmap.h" #include "progress.h" #include "promisor-remote.h" +#include "strmap.h" /* Table of rename/copy destinations */ static struct diff_rename_dst { - struct diff_filespec *two; - struct diff_filepair *pair; + struct diff_filepair *p; + struct diff_filespec *filespec_to_free; + int is_rename; /* false -> just a create; true -> rename or copy */ } *rename_dst; static int rename_dst_nr, rename_dst_alloc; +/* Mapping from break source pathname to break destination index */ +static struct strintmap *break_idx = NULL; -static int find_rename_dst(struct diff_filespec *two) +static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) { - int first, last; - - first = 0; - last = rename_dst_nr; - while (last > first) { - int next = first + ((last - first) >> 1); - struct diff_rename_dst *dst = &(rename_dst[next]); - int cmp = strcmp(two->path, dst->two->path); - if (!cmp) - return next; - if (cmp < 0) { - last = next; - continue; - } - first = next+1; - } - return -first - 1; -} - -static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two) -{ - int ofs = find_rename_dst(two); - return ofs < 0 ? NULL : &rename_dst[ofs]; + /* Lookup by p->ONE->path */ + int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1; + return (idx == -1) ? NULL : &rename_dst[idx]; } /* * Returns 0 on success, -1 if we found a duplicate. */ -static int add_rename_dst(struct diff_filespec *two) +static int add_rename_dst(struct diff_filepair *p) { - int first = find_rename_dst(two); - - if (first >= 0) + /* + * If we have multiple entries at the same path in the destination + * tree (an invalid tree, to be sure), turn off rename detection + * entirely. Once upon a time, diffcore-rename had double free() + * issues due to such duplicate paths, resulting in segfaults. See + * commit 4d6be03b95 ("diffcore-rename: avoid processing duplicate + * destinations", 2015-02-26). + */ + if (rename_dst_nr > 0 && + !strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path)) return -1; - first = -first - 1; - /* insert to make it at "first" */ ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); + rename_dst[rename_dst_nr].p = p; + rename_dst[rename_dst_nr].filespec_to_free = NULL; + rename_dst[rename_dst_nr].is_rename = 0; rename_dst_nr++; - if (first < rename_dst_nr) - MOVE_ARRAY(rename_dst + first + 1, rename_dst + first, - rename_dst_nr - first - 1); - rename_dst[first].two = alloc_filespec(two->path); - fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid, - two->mode); - rename_dst[first].pair = NULL; return 0; } @@ -89,6 +74,14 @@ static void register_rename_src(struct diff_filepair *p) !strcmp(rename_src[rename_src_nr-1].p->one->path, p->one->path)) return; + if (p->broken_pair) { + if (!break_idx) { + break_idx = xmalloc(sizeof(*break_idx)); + strintmap_init(break_idx, -1); + } + strintmap_set(break_idx, p->one->path, rename_dst_nr); + } + ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc); rename_src[rename_src_nr].p = p; rename_src[rename_src_nr].score = p->score; @@ -128,14 +121,14 @@ static void prefetch(void *prefetch_options) struct oid_array to_fetch = OID_ARRAY_INIT; for (i = 0; i < rename_dst_nr; i++) { - if (rename_dst[i].pair) + if (rename_dst[i].p->renamed_pair) /* * The loop in diffcore_rename() will not need these * blobs, so skip prefetching. */ continue; /* already found exact match */ diff_add_if_missing(options->repo, &to_fetch, - rename_dst[i].two); + rename_dst[i].p->two); } for (i = 0; i < rename_src_nr; i++) { if (options->skip_unmodified && @@ -245,26 +238,24 @@ static int estimate_similarity(struct repository *r, static void record_rename_pair(int dst_index, int src_index, int score) { - struct diff_filespec *src, *dst; - struct diff_filepair *dp; + struct diff_filepair *src = rename_src[src_index].p; + struct diff_filepair *dst = rename_dst[dst_index].p; - if (rename_dst[dst_index].pair) + if (dst->renamed_pair) die("internal error: dst already matched."); - src = rename_src[src_index].p->one; - src->rename_used++; - src->count++; + src->one->rename_used++; + src->one->count++; - dst = rename_dst[dst_index].two; - dst->count++; + rename_dst[dst_index].filespec_to_free = dst->one; + rename_dst[dst_index].is_rename = 1; - dp = diff_queue(NULL, src, dst); - dp->renamed_pair = 1; - if (!strcmp(src->path, dst->path)) - dp->score = rename_src[src_index].score; + dst->one = src->one; + dst->renamed_pair = 1; + if (!strcmp(dst->one->path, dst->two->path)) + dst->score = rename_src[src_index].score; else - dp->score = score; - rename_dst[dst_index].pair = dp; + dst->score = score; } /* @@ -310,7 +301,7 @@ static int find_identical_files(struct hashmap *srcs, struct diff_options *options) { int renames = 0; - struct diff_filespec *target = rename_dst[dst_index].two; + struct diff_filespec *target = rename_dst[dst_index].p->two; struct file_similarity *p, *best = NULL; int i = 100, best_score = -1; unsigned int hash = hash_filespec(options->repo, target); @@ -476,7 +467,7 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i (mx[i].score < minimum_score)) break; /* there is no more usable pair. */ dst = &rename_dst[mx[i].dst]; - if (dst->pair) + if (dst->is_rename) continue; /* already done, either exact or fuzzy. */ if (!copies && rename_src[mx[i].src].p->one->rename_used) continue; @@ -511,7 +502,7 @@ void diffcore_rename(struct diff_options *options) else if (!options->flags.rename_empty && is_empty_blob_oid(&p->two->oid)) continue; - else if (add_rename_dst(p->two) < 0) { + else if (add_rename_dst(p) < 0) { warning("skipping rename detection, detected" " duplicate destination '%s'", p->two->path); @@ -586,10 +577,10 @@ void diffcore_rename(struct diff_options *options) mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations), sizeof(*mx)); for (dst_cnt = i = 0; i < rename_dst_nr; i++) { - struct diff_filespec *two = rename_dst[i].two; + struct diff_filespec *two = rename_dst[i].p->two; struct diff_score *m; - if (rename_dst[i].pair) + if (rename_dst[i].is_rename) continue; /* dealt with exact match already. */ m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; @@ -646,22 +637,8 @@ void diffcore_rename(struct diff_options *options) diff_q(&outq, p); } else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { - /* - * Creation - * - * We would output this create record if it has - * not been turned into a rename/copy already. - */ - struct diff_rename_dst *dst = locate_rename_dst(p->two); - if (dst && dst->pair) { - diff_q(&outq, dst->pair); - pair_to_free = p; - } - else - /* no matching rename/copy source, so - * record this as a creation. - */ - diff_q(&outq, p); + /* Creation */ + diff_q(&outq, p); } else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) { /* @@ -682,8 +659,10 @@ void diffcore_rename(struct diff_options *options) */ if (DIFF_PAIR_BROKEN(p)) { /* broken delete */ - struct diff_rename_dst *dst = locate_rename_dst(p->one); - if (dst && dst->pair) + struct diff_rename_dst *dst = locate_rename_dst(p); + if (!dst) + BUG("tracking failed somehow; failed to find associated dst for broken pair"); + if (dst->is_rename) /* counterpart is now rename/copy */ pair_to_free = p; } @@ -693,16 +672,14 @@ void diffcore_rename(struct diff_options *options) pair_to_free = p; } - if (pair_to_free) - ; - else + if (!pair_to_free) diff_q(&outq, p); } else if (!diff_unmodified_pair(p)) /* all the usual ones need to be kept */ diff_q(&outq, p); else - /* no need to keep unmodified pairs */ + /* no need to keep unmodified pairs; FIXME: remove earlier? */ pair_to_free = p; if (pair_to_free) @@ -715,11 +692,16 @@ void diffcore_rename(struct diff_options *options) diff_debug_queue("done collapsing", q); for (i = 0; i < rename_dst_nr; i++) - free_filespec(rename_dst[i].two); + if (rename_dst[i].filespec_to_free) + free_filespec(rename_dst[i].filespec_to_free); FREE_AND_NULL(rename_dst); rename_dst_nr = rename_dst_alloc = 0; FREE_AND_NULL(rename_src); rename_src_nr = rename_src_alloc = 0; + if (break_idx) { + strintmap_clear(break_idx); + FREE_AND_NULL(break_idx); + } return; } From 350410f6b13557df71579e2d121c31135ff1cf86 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 29 Dec 2020 20:05:28 +0000 Subject: [PATCH 9/9] diffcore-rename: remove unnecessary duplicate entry checks Commit 25d5ea410f ("[PATCH] Redo rename/copy detection logic.", 2005-05-24) added a duplicate entry check on rename_src in order to avoid segfaults; the code at the time was prone to double free()s and an easy way to avoid it was just to turn off rename detection for any duplicate entries. Note that the form of the check was modified two commits ago in this series. Similarly, commit 4d6be03b95 ("diffcore-rename: avoid processing duplicate destinations", 2015-02-26) added a duplicate entry check on rename_dst for the exact same reason -- the code was prone to double free()s, and an easy way to avoid it was just to turn off rename detection entirely. Note that the form of the check was modified in the commit just before this one. In the original code in both places, the code was dealing with individual diff_filespecs and trying to match things up, instead of just keeping the original diff_filepairs around as we do now. The intervening change in structure has fixed the accounting problems and the associated double free()s that used to occur, and thus we already have a better fix. As such, we can remove the band-aid checks for duplicate entries. Due to the last two patches, the diffcore_rename() setup is no longer a sizeable chunk of overall runtime. Thus, in a large rebase of many commits with lots of renames and several optimizations to inexact rename detection, this patch only speeds up the overall code by about half a percent or so and is pretty close to the run-to-run variability making it hard to get an exact measurement. However, with some trace2 regions around the setup code in diffcore_rename() so that I can focus on just it, I measure that this patch consistently saves almost a third of the remaining time spent in diffcore_rename() setup. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 23 ----------------------- t/t4058-diff-duplicates.sh | 2 +- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 2a8fcd5292..90db9ebd6d 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -34,18 +34,6 @@ static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) */ static int add_rename_dst(struct diff_filepair *p) { - /* - * If we have multiple entries at the same path in the destination - * tree (an invalid tree, to be sure), turn off rename detection - * entirely. Once upon a time, diffcore-rename had double free() - * issues due to such duplicate paths, resulting in segfaults. See - * commit 4d6be03b95 ("diffcore-rename: avoid processing duplicate - * destinations", 2015-02-26). - */ - if (rename_dst_nr > 0 && - !strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path)) - return -1; - ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); rename_dst[rename_dst_nr].p = p; rename_dst[rename_dst_nr].filespec_to_free = NULL; @@ -63,17 +51,6 @@ static int rename_src_nr, rename_src_alloc; static void register_rename_src(struct diff_filepair *p) { - /* - * If we have multiple entries at the same path in the source tree - * (an invalid tree, to be sure), avoid using more more than one - * such entry in rename detection. Once upon a time, doing so - * caused segfaults; see commit 25d5ea410f ("[PATCH] Redo - * rename/copy detection logic.", 2005-05-24). - */ - if (rename_src_nr > 0 && - !strcmp(rename_src[rename_src_nr-1].p->one->path, p->one->path)) - return; - if (p->broken_pair) { if (!break_idx) { break_idx = xmalloc(sizeof(*break_idx)); diff --git a/t/t4058-diff-duplicates.sh b/t/t4058-diff-duplicates.sh index ad3f37d388..54614b814d 100755 --- a/t/t4058-diff-duplicates.sh +++ b/t/t4058-diff-duplicates.sh @@ -91,7 +91,7 @@ test_expect_success 'diff-tree between duplicate trees' ' test_expect_success 'diff-tree with renames' ' # See NOTICE at top of file. git diff-tree -M -r --no-abbrev one two >actual && - test_cmp expect actual + test_must_be_empty actual ' test_expect_success 'diff-tree FROM duplicate tree' '