Merge branch 'ds/cache-tree-basics'

Document, clean up, and optimize the code around the cache-tree
extension in the index.

* ds/cache-tree-basics:
  cache-tree: speed up consecutive path comparisons
  cache-tree: use ce_namelen() instead of strlen()
  index-format: discuss recursion of cache-tree better
  index-format: update preamble to cache tree extension
  index-format: use 'cache tree' over 'cached tree'
  cache-tree: trace regions for prime_cache_tree
  cache-tree: trace regions for I/O
  cache-tree: use trace2 in cache_tree_update()
  unpack-trees: add trace2 regions
  tree-walk: report recursion counts
This commit is contained in:
Junio C Hamano 2021-02-05 16:40:44 -08:00
commit a0a2d75d3b
5 changed files with 92 additions and 16 deletions

View file

@ -26,7 +26,7 @@ Git index format
Extensions are identified by signature. Optional extensions can
be ignored if Git does not understand them.
Git currently supports cached tree and resolve undo extensions.
Git currently supports cache tree and resolve undo extensions.
4-byte extension signature. If the first byte is 'A'..'Z' the
extension is optional and can be ignored.
@ -136,14 +136,35 @@ Git index format
== Extensions
=== Cached tree
=== Cache tree
Cached tree extension contains pre-computed hashes for trees that can
be derived from the index. It helps speed up tree object generation
from index for a new commit.
Since the index does not record entries for directories, the cache
entries cannot describe tree objects that already exist in the object
database for regions of the index that are unchanged from an existing
commit. The cache tree extension stores a recursive tree structure that
describes the trees that already exist and completely match sections of
the cache entries. This speeds up tree object generation from the index
for a new commit by only computing the trees that are "new" to that
commit. It also assists when comparing the index to another tree, such
as `HEAD^{tree}`, since sections of the index can be skipped when a tree
comparison demonstrates equality.
When a path is updated in index, the path must be invalidated and
removed from tree cache.
The recursive tree structure uses nodes that store a number of cache
entries, a list of subnodes, and an object ID (OID). The OID references
the existing tree for that node, if it is known to exist. The subnodes
correspond to subdirectories that themselves have cache tree nodes. The
number of cache entries corresponds to the number of cache entries in
the index that describe paths within that tree's directory.
The cache tree extension tracks the full directory structure, but this
is generally smaller than the full cache entry list.
When a path is updated in the index, Git invalidates all nodes of the
recursive cache tree corresponding to the parent directories of that
path. We store these tree nodes as being "invalid" by using "-1" as the
number of cache entries. Invalid nodes still store a span of index
entries, allowing Git to focus its efforts when reconstructing a full
cache tree.
The signature for this extension is { 'T', 'R', 'E', 'E' }.
@ -174,7 +195,8 @@ Git index format
first entry represents the root level of the repository, followed by the
first subtree--let's call this A--of the root level (with its name
relative to the root level), followed by the first subtree of A (with
its name relative to A), ...
its name relative to A), and so on. The specified number of subtrees
indicates when the current level of the recursive stack is complete.
=== Resolve undo

View file

@ -185,12 +185,14 @@ static int verify_cache(struct cache_entry **cache,
* the cache is sorted. Also path can appear only once,
* which means conflicting one would immediately follow.
*/
const char *this_name = cache[i]->name;
const char *next_name = cache[i+1]->name;
int this_len = strlen(this_name);
if (this_len < strlen(next_name) &&
strncmp(this_name, next_name, this_len) == 0 &&
next_name[this_len] == '/') {
const struct cache_entry *this_ce = cache[i];
const struct cache_entry *next_ce = cache[i + 1];
const char *this_name = this_ce->name;
const char *next_name = next_ce->name;
int this_len = ce_namelen(this_ce);
if (this_len < ce_namelen(next_ce) &&
next_name[this_len] == '/' &&
strncmp(this_name, next_name, this_len) == 0) {
if (10 < ++funny) {
fprintf(stderr, "...\n");
break;
@ -442,7 +444,9 @@ int cache_tree_update(struct index_state *istate, int flags)
if (i)
return i;
trace_performance_enter();
trace2_region_enter("cache_tree", "update", the_repository);
i = update_one(it, cache, entries, "", 0, &skip, flags);
trace2_region_leave("cache_tree", "update", the_repository);
trace_performance_leave("cache_tree_update");
if (i < 0)
return i;
@ -492,7 +496,9 @@ static void write_one(struct strbuf *buffer, struct cache_tree *it,
/*
 * Write the cache tree rooted at `root` into `sb` by delegating to
 * write_one(), passing "" and 0 as the remaining arguments (presumably the
 * starting path and its length -- confirm against write_one()).
 *
 * The call is bracketed by a trace2 region opened and closed with the
 * strings "cache_tree" and "write", reported against the_repository.
 */
void cache_tree_write(struct strbuf *sb, struct cache_tree *root)
{
trace2_region_enter("cache_tree", "write", the_repository);
write_one(sb, root, "", 0);
trace2_region_leave("cache_tree", "write", the_repository);
}
static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
@ -581,9 +587,16 @@ static struct cache_tree *read_one(const char **buffer, unsigned long *size_p)
/*
 * Parse a cache tree out of `buffer` (`size` bytes) via read_one().
 * Returns NULL without parsing when the first byte of `buffer` is non-zero.
 *
 * NOTE(review): as rendered here, the bare `return read_one(...)` below
 * comes before the traced sequence, which would make the trace2 lines
 * unreachable. It looks like the removed pre-image line of this diff hunk
 * shown next to its replacement -- confirm against the actual file.
 */
struct cache_tree *cache_tree_read(const char *buffer, unsigned long size)
{
struct cache_tree *result;
if (buffer[0])
return NULL; /* not the whole tree */
return read_one(&buffer, &size);
/* Same read_one() call, wrapped in a "cache_tree"/"read" trace2 region. */
trace2_region_enter("cache_tree", "read", the_repository);
result = read_one(&buffer, &size);
trace2_region_leave("cache_tree", "read", the_repository);
return result;
}
static struct cache_tree *cache_tree_find(struct cache_tree *it, const char *path)
@ -733,10 +746,13 @@ void prime_cache_tree(struct repository *r,
struct index_state *istate,
struct tree *tree)
{
/*
 * Discard istate's current cache tree, allocate a fresh one, fill it from
 * `tree` via prime_cache_tree_rec(), and flag the index with
 * CACHE_TREE_CHANGED.
 *
 * The trace2 region is reported against `r`, the repository the caller
 * passed in and the one handed to prime_cache_tree_rec(), rather than the
 * global the_repository, so the trace is attributed to the repository
 * actually being worked on.
 *
 * NOTE(review): the region category here is "cache-tree" while the other
 * cache tree regions in this file use "cache_tree". The string is emitted
 * at runtime, so it is left unchanged -- confirm whether the difference
 * is intentional.
 */
trace2_region_enter("cache-tree", "prime_cache_tree", r);
cache_tree_free(&istate->cache_tree);
istate->cache_tree = cache_tree();
prime_cache_tree_rec(r, istate->cache_tree, tree);
istate->cache_changed |= CACHE_TREE_CHANGED;
trace2_region_leave("cache-tree", "prime_cache_tree", r);
}
/*

View file

@ -33,7 +33,7 @@ test_expect_success 'reset --hard should restore unmerged ones' '
'
test_expect_success 'reset --hard did not corrupt index or cached-tree' '
test_expect_success 'reset --hard did not corrupt index or cache-tree' '
T=$(git write-tree) &&
rm -f .git/index &&

View file

@ -4,6 +4,7 @@
#include "object-store.h"
#include "tree.h"
#include "pathspec.h"
#include "json-writer.h"
static const char *get_mode(const char *str, unsigned int *modep)
{
@ -167,6 +168,25 @@ int tree_entry_gently(struct tree_desc *desc, struct name_entry *entry)
return 1;
}
/* Set to 1 once setup_traverse_info() has registered the atexit handler. */
static int traverse_trees_atexit_registered;
/* Incremented each time traverse_trees() is entered. */
static int traverse_trees_count;
/*
 * Incremented on entry to traverse_trees() and decremented just before its
 * final `return error`.
 */
static int traverse_trees_cur_depth;
/* Largest value traverse_trees_cur_depth has reached so far. */
static int traverse_trees_max_depth;
/*
 * atexit() handler that reports the traverse_trees() counters.
 *
 * Builds one JSON object holding "traverse_trees_count" and
 * "traverse_trees_max_depth" and hands it to trace2_data_json() together
 * with the strings "traverse_trees" and "statistics". It is registered by
 * setup_traverse_info(), and only when trace2_is_enabled() is true.
 */
static void trace2_traverse_trees_statistics_atexit(void)
{
struct json_writer jw = JSON_WRITER_INIT;
jw_object_begin(&jw, 0);
/* Only the call count and the maximum depth are reported. */
jw_object_intmax(&jw, "traverse_trees_count", traverse_trees_count);
jw_object_intmax(&jw, "traverse_trees_max_depth", traverse_trees_max_depth);
jw_end(&jw);
trace2_data_json("traverse_trees", the_repository, "statistics", &jw);
/* Release the writer after the object has been handed to trace2. */
jw_release(&jw);
}
void setup_traverse_info(struct traverse_info *info, const char *base)
{
size_t pathlen = strlen(base);
@ -180,6 +200,11 @@ void setup_traverse_info(struct traverse_info *info, const char *base)
info->namelen = pathlen;
if (pathlen)
info->prev = &dummy;
if (trace2_is_enabled() && !traverse_trees_atexit_registered) {
atexit(trace2_traverse_trees_statistics_atexit);
traverse_trees_atexit_registered = 1;
}
}
char *make_traverse_path(char *path, size_t pathlen,
@ -416,6 +441,12 @@ int traverse_trees(struct index_state *istate,
int interesting = 1;
char *traverse_path;
traverse_trees_count++;
traverse_trees_cur_depth++;
if (traverse_trees_cur_depth > traverse_trees_max_depth)
traverse_trees_max_depth = traverse_trees_cur_depth;
if (n >= ARRAY_SIZE(entry))
BUG("traverse_trees() called with too many trees (%d)", n);
@ -515,6 +546,8 @@ int traverse_trees(struct index_state *istate,
free(traverse_path);
info->traverse_path = NULL;
strbuf_release(&base);
traverse_trees_cur_depth--;
return error;
}

View file

@ -1580,6 +1580,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
die("unpack_trees takes at most %d trees", MAX_UNPACK_TREES);
trace_performance_enter();
trace2_region_enter("unpack_trees", "unpack_trees", the_repository);
if (!core_apply_sparse_checkout || !o->update)
o->skip_sparse_checkout = 1;
if (!o->skip_sparse_checkout && !o->pl) {
@ -1653,7 +1655,9 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
}
trace_performance_enter();
trace2_region_enter("unpack_trees", "traverse_trees", the_repository);
ret = traverse_trees(o->src_index, len, t, &info);
trace2_region_leave("unpack_trees", "traverse_trees", the_repository);
trace_performance_leave("traverse_trees");
if (ret < 0)
goto return_failed;
@ -1741,6 +1745,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
done:
if (free_pattern_list)
clear_pattern_list(&pl);
trace2_region_leave("unpack_trees", "unpack_trees", the_repository);
trace_performance_leave("unpack_trees");
return ret;