git/t/t5325-reverse-index.sh

209 lines
5.1 KiB
Bash
Raw Normal View History

#!/bin/sh
test_description='on-disk reverse index'
TEST_PASSES_SANITIZE_LEAK=true
. ./test-lib.sh
# The below tests want control over the 'pack.writeReverseIndex' setting
# themselves to assert various combinations of it with other options.
sane_unset GIT_TEST_NO_WRITE_REV_INDEX
packdir=.git/objects/pack
test_expect_success 'setup' '
test_commit base &&
config: enable `pack.writeReverseIndex` by default Back in e37d0b8730 (builtin/index-pack.c: write reverse indexes, 2021-01-25), Git learned how to read and write a pack's reverse index from a file instead of in-memory. A pack's reverse index is a mapping from pack position (that is, the order that objects appear together in a ".pack") to their position in lexical order (that is, the order that objects are listed in an ".idx" file). Reverse indexes are consulted often during pack-objects, as well as during auxiliary operations that require mapping between pack offsets, pack order, and index index. They are useful in GitHub's infrastructure, where we have seen a dramatic increase in performance when writing ".rev" files[1]. In particular: - an ~80% reduction in the time it takes to serve fetches on a popular repository, Homebrew/homebrew-core. - a ~60% reduction in the peak memory usage to serve fetches on that same repository. - a collective savings of ~35% in CPU time across all pack-objects invocations serving fetches across all repositories in a single datacenter. Reverse indexes are also beneficial to end-users as well as forges. For example, the time it takes to generate a pack containing the objects for the 10 most recent commits in linux.git (representing a typical push) is significantly faster when on-disk reverse indexes are available: $ { git rev-parse HEAD && printf '^' && git rev-parse HEAD~10 } >in $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} pack-objects --delta-base-offset --revs --stdout <in >/dev/null' Benchmark 1: git.compile -c pack.readReverseIndex=false pack-objects --delta-base-offset --revs --stdout <in >/dev/null Time (mean ± σ): 543.0 ms ± 20.3 ms [User: 616.2 ms, System: 58.8 ms] Range (min … max): 521.0 ms … 577.9 ms 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true pack-objects --delta-base-offset --revs --stdout <in >/dev/null Time (mean ± σ): 245.0 ms ± 11.4 ms [User: 335.6 ms, System: 31.3 ms] Range (min … max): 226.0 ms … 259.6 ms 13 runs Summary 'git.compile -c pack.readReverseIndex=true pack-objects --delta-base-offset --revs --stdout <in >/dev/null' ran 2.22 ± 0.13 times faster than 'git.compile -c pack.readReverseIndex=false pack-objects --delta-base-offset --revs --stdout <in >/dev/null' The same is true of writing a pack containing the objects for the 30 most-recent commits: $ { git rev-parse HEAD && printf '^' && git rev-parse HEAD~30 } >in $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} pack-objects --delta-base-offset --revs --stdout <in >/dev/null' Benchmark 1: git.compile -c pack.readReverseIndex=false pack-objects --delta-base-offset --revs --stdout <in >/dev/null Time (mean ± σ): 866.5 ms ± 16.2 ms [User: 1414.5 ms, System: 97.0 ms] Range (min … max): 839.3 ms … 886.9 ms 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true pack-objects --delta-base-offset --revs --stdout <in >/dev/null Time (mean ± σ): 581.6 ms ± 10.2 ms [User: 1181.7 ms, System: 62.6 ms] Range (min … max): 567.5 ms … 599.3 ms 10 runs Summary 'git.compile -c pack.readReverseIndex=true pack-objects --delta-base-offset --revs --stdout <in >/dev/null' ran 1.49 ± 0.04 times faster than 'git.compile -c pack.readReverseIndex=false pack-objects --delta-base-offset --revs --stdout <in >/dev/null' ...and savings on trivial operations like computing the on-disk size of a single (packed) object are even more dramatic: $ git rev-parse HEAD >in $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} cat-file --batch-check="%(objectsize:disk)" <in' Benchmark 1: git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" <in Time (mean ± σ): 305.8 ms ± 11.4 ms [User: 264.2 ms, System: 41.4 ms] Range (min … max): 290.3 ms … 331.1 ms 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" <in Time (mean ± σ): 4.0 ms ± 0.3 ms [User: 1.7 ms, System: 2.3 ms] Range (min … max): 1.6 ms … 4.6 ms 1155 runs Summary 'git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" <in' ran 76.96 ± 6.25 times faster than 'git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" <in' In the more than two years since e37d0b8730 was merged, Git's implementation of on-disk reverse indexes has been thoroughly tested, both from users enabling `pack.writeReverseIndexes`, and from GitHub's deployment of the feature. The latter has been running without incident for more than two years. This patch changes Git's behavior to write on-disk reverse indexes by default when indexing a pack, which should make the above operations faster for everybody's Git installation after a repack. (The previous commit explains some potential drawbacks of using on-disk reverse indexes in certain limited circumstances, that essentially boil down to a trade-off between time to generate, and time to access. For those limited cases, the `pack.readReverseIndex` escape hatch can be used). [1]: https://github.blog/2021-04-29-scaling-monorepo-maintenance/#reverse-indexes Signed-off-by: Taylor Blau <me@ttaylorr.com> Acked-by: Derrick Stolee <derrickstolee@github.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-04-12 22:20:33 +00:00
test_config pack.writeReverseIndex false &&
pack=$(git pack-objects --all $packdir/pack) &&
rev=$packdir/pack-$pack.rev &&
test_path_is_missing $rev
'
test_index_pack () {
rm -f $rev &&
conf=$1 &&
shift &&
# remove the index since Windows won't overwrite an existing file
rm $packdir/pack-$pack.idx &&
git -c pack.writeReverseIndex=$conf index-pack "$@" \
$packdir/pack-$pack.pack
}
test_expect_success 'index-pack with pack.writeReverseIndex' '
test_index_pack "" &&
test_path_is_missing $rev &&
test_index_pack false &&
test_path_is_missing $rev &&
test_index_pack true &&
test_path_is_file $rev
'
test_expect_success 'index-pack with --[no-]rev-index' '
for conf in "" true false
do
test_index_pack "$conf" --rev-index &&
test_path_exists $rev &&
test_index_pack "$conf" --no-rev-index &&
test_path_is_missing $rev || return 1
done
'
test_expect_success 'index-pack can verify reverse indexes' '
test_when_finished "rm -f $rev" &&
test_index_pack true &&
test_path_is_file $rev &&
git index-pack --rev-index --verify $packdir/pack-$pack.pack &&
# Intentionally corrupt the reverse index.
chmod u+w $rev &&
printf "xxxx" | dd of=$rev bs=1 count=4 conv=notrunc &&
test_must_fail git index-pack --rev-index --verify \
$packdir/pack-$pack.pack 2>err &&
grep "validation error" err
'
test_expect_success 'index-pack infers reverse index name with -o' '
git index-pack --rev-index -o other.idx $packdir/pack-$pack.pack &&
test_path_is_file other.idx &&
test_path_is_file other.rev
'
test_expect_success 'pack-objects respects pack.writeReverseIndex' '
test_when_finished "rm -fr pack-1-*" &&
git -c pack.writeReverseIndex= pack-objects --all pack-1 &&
test_path_is_missing pack-1-*.rev &&
git -c pack.writeReverseIndex=false pack-objects --all pack-1 &&
test_path_is_missing pack-1-*.rev &&
git -c pack.writeReverseIndex=true pack-objects --all pack-1 &&
test_path_is_file pack-1-*.rev
'
test_expect_success 'reverse index is not generated when available on disk' '
test_index_pack true &&
test_path_is_file $rev &&
git rev-parse HEAD >tip &&
GIT_TEST_REV_INDEX_DIE_IN_MEMORY=1 git cat-file \
--batch-check="%(objectsize:disk)" <tip
'
pack-revindex: introduce `pack.readReverseIndex` Since 1615c567b8 (Documentation/config/pack.txt: advertise 'pack.writeReverseIndex', 2021-01-25), we have had the `pack.writeReverseIndex` configuration option, which tells Git whether or not it is allowed to write a ".rev" file when indexing a pack. Introduce a complementary configuration knob, `pack.readReverseIndex` to control whether or not Git will read any ".rev" file(s) that may be available on disk. This option is useful for debugging, as well as disabling the effect of ".rev" files in certain instances. This is useful because of the trade-off[^1] between the time it takes to generate a reverse index (slow from scratch, fast when reading an existing ".rev" file), and the time it takes to access a record (the opposite). For example, even though it is faster to use the on-disk reverse index when computing the on-disk size of a packed object, it is slower to enumerate the same value for all objects. Here are a couple of examples from linux.git. When computing the above for a single object, using the on-disk reverse index is significantly faster: $ git rev-parse HEAD >in $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} cat-file --batch-check="%(objectsize:disk)" <in' Benchmark 1: git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" <in Time (mean ± σ): 302.5 ms ± 12.5 ms [User: 258.7 ms, System: 43.6 ms] Range (min … max): 291.1 ms … 328.1 ms 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" <in Time (mean ± σ): 3.9 ms ± 0.3 ms [User: 1.6 ms, System: 2.4 ms] Range (min … max): 2.0 ms … 4.4 ms 801 runs Summary 'git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" <in' ran 77.29 ± 7.14 times faster than 'git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" <in' , but when instead trying to compute the on-disk object size for all objects in the repository, using the ".rev" file is a disadvantage over creating the reverse index from scratch: $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} cat-file --batch-check="%(objectsize:disk)" --batch-all-objects' Benchmark 1: git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" --batch-all-objects Time (mean ± σ): 8.258 s ± 0.035 s [User: 7.949 s, System: 0.308 s] Range (min … max): 8.199 s … 8.293 s 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" --batch-all-objects Time (mean ± σ): 16.976 s ± 0.107 s [User: 16.706 s, System: 0.268 s] Range (min … max): 16.839 s … 17.105 s 10 runs Summary 'git.compile -c pack.readReverseIndex=false cat-file --batch-check="%(objectsize:disk)" --batch-all-objects' ran 2.06 ± 0.02 times faster than 'git.compile -c pack.readReverseIndex=true cat-file --batch-check="%(objectsize:disk)" --batch-all-objects' Luckily, the results when running `git cat-file` with `--unordered` are closer together: $ hyperfine -L v false,true 'git.compile -c pack.readReverseIndex={v} cat-file --unordered --batch-check="%(objectsize:disk)" --batch-all-objects' Benchmark 1: git.compile -c pack.readReverseIndex=false cat-file --unordered --batch-check="%(objectsize:disk)" --batch-all-objects Time (mean ± σ): 5.066 s ± 0.105 s [User: 4.792 s, System: 0.274 s] Range (min … max): 4.943 s … 5.220 s 10 runs Benchmark 2: git.compile -c pack.readReverseIndex=true cat-file --unordered --batch-check="%(objectsize:disk)" --batch-all-objects Time (mean ± σ): 6.193 s ± 0.069 s [User: 5.937 s, System: 0.255 s] Range (min … max): 6.145 s … 6.356 s 10 runs Summary 'git.compile -c pack.readReverseIndex=false cat-file --unordered --batch-check="%(objectsize:disk)" --batch-all-objects' ran 1.22 ± 0.03 times faster than 'git.compile -c pack.readReverseIndex=true cat-file --unordered --batch-check="%(objectsize:disk)" --batch-all-objects' Because the equilibrium point between these two is highly machine- and repository-dependent, allow users to configure whether or not they will read any ".rev" file(s) with this configuration knob. [^1]: Generating a reverse index in memory takes O(N) time (where N is the number of objects in the repository), since we use a radix sort. Reading an entry from an on-disk ".rev" file is slower since each operation is bound by disk I/O instead of memory I/O. In order to compute the on-disk size of a packed object, we need to find the offset of our object, and the adjacent object (the on-disk size difference of these two). Finding the first offset requires a binary search. Finding the latter involves a single .rev lookup. Signed-off-by: Taylor Blau <me@ttaylorr.com> Acked-by: Derrick Stolee <derrickstolee@github.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-04-12 22:20:30 +00:00
test_expect_success 'reverse index is ignored when pack.readReverseIndex is false' '
test_index_pack true &&
test_path_is_file $rev &&
test_config pack.readReverseIndex false &&
git rev-parse HEAD >tip &&
GIT_TEST_REV_INDEX_DIE_ON_DISK=1 git cat-file \
--batch-check="%(objectsize:disk)" <tip
'
test_expect_success 'revindex in-memory vs on-disk' '
git init repo &&
test_when_finished "rm -fr repo" &&
(
cd repo &&
test_commit commit &&
git rev-list --objects --no-object-names --all >objects &&
git -c pack.writeReverseIndex=false repack -ad &&
test_path_is_missing $packdir/pack-*.rev &&
git cat-file --batch-check="%(objectsize:disk) %(objectname)" \
<objects >in-core &&
git -c pack.writeReverseIndex=true repack -ad &&
test_path_is_file $packdir/pack-*.rev &&
git cat-file --batch-check="%(objectsize:disk) %(objectname)" \
<objects >on-disk &&
test_cmp on-disk in-core
)
'
test_expect_success 'fsck succeeds on good rev-index' '
test_when_finished rm -fr repo &&
git init repo &&
(
cd repo &&
test_commit commit &&
git -c pack.writeReverseIndex=true repack -ad &&
git fsck 2>err &&
test_must_be_empty err
)
'
test_expect_success 'set up rev-index corruption tests' '
git init corrupt &&
(
cd corrupt &&
test_commit commit &&
git -c pack.writeReverseIndex=true repack -ad &&
revfile=$(ls .git/objects/pack/pack-*.rev) &&
chmod a+w $revfile &&
cp $revfile $revfile.bak
)
'
corrupt_rev_and_verify () {
(
pos="$1" &&
value="$2" &&
error="$3" &&
cd corrupt &&
revfile=$(ls .git/objects/pack/pack-*.rev) &&
# Reset to original rev-file.
cp $revfile.bak $revfile &&
printf "$value" | dd of=$revfile bs=1 seek="$pos" conv=notrunc &&
test_must_fail git fsck 2>err &&
grep "$error" err
)
}
test_expect_success 'fsck catches invalid checksum' '
revfile=$(ls corrupt/.git/objects/pack/pack-*.rev) &&
orig_size=$(wc -c <$revfile) &&
hashpos=$((orig_size - 10)) &&
corrupt_rev_and_verify $hashpos bogus \
"invalid checksum"
'
test_expect_success 'fsck catches invalid row position' '
corrupt_rev_and_verify 14 "\07" \
"invalid rev-index position"
'
test_expect_success 'fsck catches invalid header: magic number' '
corrupt_rev_and_verify 1 "\07" \
"reverse-index file .* has unknown signature"
'
test_expect_success 'fsck catches invalid header: version' '
corrupt_rev_and_verify 7 "\02" \
"reverse-index file .* has unsupported version"
'
test_expect_success 'fsck catches invalid header: hash function' '
corrupt_rev_and_verify 11 "\03" \
"reverse-index file .* has unsupported hash id"
'
test_done