[PATCH] Optimize diff-tree -[CM] --stdin

This attempts to optimize "diff-tree -[CM] --stdin", which
compares successive tree pairs.  This optimization does not
make much sense for the other commands in the diff-* family.

When reading from --stdin and using rename/copy detection, the
patch makes diff-tree read the current index file first.
This is done to reuse the optimization used by diff-cache in the
non-cached case.  Similarity estimator can avoid expanding a
blob if the index says what is in the work tree has an exact
copy of that blob already expanded.

Another optimization the patch makes is to check only file sizes
first to terminate similarity estimation early.  In order for
this to work, it needs a way to tell the size of the blob
without expanding it.  Since an obvious way of doing it, which
is to keep all the blobs previously used in the memory, is too
costly, it does so by keeping the filesize for each object it
has already seen in memory.

Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Junio C Hamano 2005-05-27 15:56:38 -07:00 committed by Linus Torvalds
parent 6145ee8b36
commit f0c6b2a2fd
6 changed files with 102 additions and 9 deletions

View file

@ -578,6 +578,9 @@ int main(int argc, const char **argv)
if (!read_stdin)
return 0;
if (detect_rename)
diff_setup_opt |= (DIFF_SETUP_USE_SIZE_CACHE |
DIFF_SETUP_USE_CACHE);
while (fgets(line, sizeof(line), stdin))
diff_tree_stdin(line);

83
diff.c
View file

@ -12,6 +12,7 @@ static const char *diff_opts = "-pu";
static unsigned char null_sha1[20] = { 0, };
static int reverse_diff;
static int use_size_cache;
static const char *external_diff(void)
{
@ -222,12 +223,60 @@ static int work_tree_matches(const char *name, const unsigned char *sha1)
return 1;
}
/*
 * In-core table remembering the size recorded for an object name.
 *
 * Each entry pairs a 20-byte object name with a size.  The table itself
 * (sha1_size_cache) is a growable array of pointers to individually
 * allocated entries; locate_size_cache() keeps it ordered by object
 * name so that it can be binary searched.
 */
static struct sha1_size_cache {
unsigned char sha1[20];
unsigned long size;
} **sha1_size_cache;
/* Number of entries in use, and number of pointer slots allocated. */
static int sha1_size_cache_nr, sha1_size_cache_alloc;
/*
 * Find the size-cache entry for the object named by sha1, optionally
 * creating it.
 *
 * sha1: 20-byte object name to look for.
 * size: UINT_MAX asks for a lookup only; any other value is stored in
 *       a newly created entry when the name is not in the table yet.
 *
 * Returns the existing entry when the name is already present (its
 * stored size is left untouched), the newly inserted entry otherwise,
 * or NULL for a lookup-only call that found nothing.
 *
 * NOTE(review): size is an unsigned long while the "lookup only"
 * marker is UINT_MAX; where unsigned long is wider than unsigned int,
 * a genuine size equal to UINT_MAX is indistinguishable from a lookup
 * request and is never recorded.
 */
static struct sha1_size_cache *locate_size_cache(unsigned char *sha1,
						 unsigned long size)
{
	struct sha1_size_cache *entry;
	int lo = 0;
	int hi = sha1_size_cache_nr;
	int tail;

	/*
	 * Binary search.  Entries are kept in descending memcmp() order:
	 * an entry that compares lower than the key means the key, if
	 * present, sits at a smaller index.
	 */
	while (lo < hi) {
		int mid = (hi + lo) >> 1;
		int cmp;

		entry = sha1_size_cache[mid];
		cmp = memcmp(entry->sha1, sha1, 20);
		if (cmp == 0)
			return entry;
		if (cmp < 0)
			hi = mid;
		else
			lo = mid + 1;
	}

	/* Not found, and the caller only wanted to look. */
	if (size == UINT_MAX)
		return NULL;

	/* Make sure there is room for one more pointer. */
	if (sha1_size_cache_nr >= sha1_size_cache_alloc) {
		sha1_size_cache_alloc = alloc_nr(sha1_size_cache_alloc);
		sha1_size_cache = xrealloc(sha1_size_cache,
					   sha1_size_cache_alloc *
					   sizeof(*sha1_size_cache));
	}

	/*
	 * Open a hole at index "lo" by shifting the entries from there on
	 * up by one slot.  lo never exceeds the old entry count, so this
	 * moves zero entries when the new one goes at the very end.
	 */
	tail = sha1_size_cache_nr - lo;
	sha1_size_cache_nr++;
	memmove(sha1_size_cache + lo + 1, sha1_size_cache + lo,
		tail * sizeof(*sha1_size_cache));

	entry = xmalloc(sizeof(*entry));
	sha1_size_cache[lo] = entry;
	memcpy(entry->sha1, sha1, 20);
	entry->size = size;
	return entry;
}
/*
* While doing rename detection and pickaxe operation, we may need to
* grab the data for the blob (or file) for our own in-core comparison.
* diff_filespec has data and size fields for this purpose.
*/
int diff_populate_filespec(struct diff_filespec *s)
int diff_populate_filespec(struct diff_filespec *s, int size_only)
{
int err = 0;
if (!DIFF_FILE_VALID(s))
@ -235,6 +284,9 @@ int diff_populate_filespec(struct diff_filespec *s)
if (S_ISDIR(s->mode))
return -1;
if (!use_size_cache)
size_only = 0;
if (s->data)
return err;
if (!s->sha1_valid ||
@ -254,6 +306,8 @@ int diff_populate_filespec(struct diff_filespec *s)
s->size = st.st_size;
if (!s->size)
goto empty;
if (size_only)
return 0;
if (S_ISLNK(st.st_mode)) {
int ret;
s->data = xmalloc(s->size);
@ -273,9 +327,21 @@ int diff_populate_filespec(struct diff_filespec *s)
close(fd);
}
else {
/* We cannot do size only for SHA1 blobs */
char type[20];
struct sha1_size_cache *e;
if (size_only) {
e = locate_size_cache(s->sha1, UINT_MAX);
if (e) {
s->size = e->size;
return 0;
}
}
s->data = read_sha1_file(s->sha1, type, &s->size);
s->should_free = 1;
if (s->data && size_only)
locate_size_cache(s->sha1, s->size);
}
return 0;
}
@ -361,7 +427,7 @@ static void prepare_temp_file(const char *name,
return;
}
else {
if (diff_populate_filespec(one))
if (diff_populate_filespec(one, 0))
die("cannot read data blob for %s", one->path);
prep_temp_blob(temp, one->data, one->size,
one->sha1, one->mode);
@ -496,6 +562,19 @@ void diff_setup(int flags)
{
/*
 * Translate the DIFF_SETUP_* bits in flags into this file's settings.
 * Each bit only ever turns its feature on; nothing here turns one off.
 */
if (flags & DIFF_SETUP_REVERSE)
reverse_diff = 1;
if (flags & DIFF_SETUP_USE_CACHE) {
if (!active_cache)
/* read-cache does not die even when it fails
* so it is safe for us to do this here. Also
* it does not smudge active_cache or active_nr
* when it fails, so we do not have to worry about
* cleaning it up ourselves either.
*/
read_cache();
}
/* Lets diff_populate_filespec() honor size-only requests. */
if (flags & DIFF_SETUP_USE_SIZE_CACHE)
use_size_cache = 1;
}
struct diff_queue_struct diff_queued_diff;

2
diff.h
View file

@ -29,6 +29,8 @@ extern void diff_unmerge(const char *path);
extern int diff_scoreopt_parse(const char *opt);
#define DIFF_SETUP_REVERSE 1
#define DIFF_SETUP_USE_CACHE 2
#define DIFF_SETUP_USE_SIZE_CACHE 4
extern void diff_setup(int flags);
#define DIFF_DETECT_RENAME 1

View file

@ -11,7 +11,7 @@ static int contains(struct diff_filespec *one,
{
unsigned long offset, sz;
const char *data;
if (diff_populate_filespec(one))
if (diff_populate_filespec(one, 0))
return 0;
sz = one->size;
data = one->data;

View file

@ -99,8 +99,11 @@ static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)
if (src->sha1_valid && dst->sha1_valid &&
!memcmp(src->sha1, dst->sha1, 20))
return 1;
if (diff_populate_filespec(src) || diff_populate_filespec(dst))
/* this is an error but will be caught downstream */
if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
return 0;
if (src->size != dst->size)
return 0;
if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
return 0;
if (src->size == dst->size &&
!memcmp(src->data, dst->data, src->size))
@ -125,9 +128,11 @@ static int estimate_similarity(struct diff_filespec *src,
* dst, and then some edit has been applied to dst.
*
* Compare them and return how similar they are, representing
* the score as an integer between 0 and 10000, except
* where they match exactly it is considered better than anything
* else.
* the score as an integer between 0 and MAX_SCORE.
*
* When there is an exact match, it is considered a better
* match than anything else; the destination does not even
* call into this function in that case.
*/
void *delta;
unsigned long delta_size, base_size;
@ -147,6 +152,7 @@ static int estimate_similarity(struct diff_filespec *src,
/* We would not consider edits that change the file size so
* drastically. delta_size must be smaller than
* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
*
* Note that base_size == 0 case is handled here already
* and the final score computation below would not have a
* divide-by-zero issue.
@ -154,6 +160,9 @@ static int estimate_similarity(struct diff_filespec *src,
if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
return 0;
if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
return 0; /* error but caught downstream */
delta = diff_delta(src->data, src->size,
dst->data, dst->size,
&delta_size);

View file

@ -33,7 +33,7 @@ extern struct diff_filespec *alloc_filespec(const char *);
extern void fill_filespec(struct diff_filespec *, const unsigned char *,
unsigned short);
extern int diff_populate_filespec(struct diff_filespec *);
extern int diff_populate_filespec(struct diff_filespec *, int);
extern void diff_free_filespec_data(struct diff_filespec *);
struct diff_filepair {