#!/bin/sh

test_description='basic git gc tests
'

. ./test-lib.sh
. "$TEST_DIRECTORY"/lib-terminal.sh

test_expect_success 'setup' '
	# Do not let the amount of physical memory affect gc behavior;
	# make sure we always pack everything into one pack by default.
	git config gc.bigPackThreshold 2g &&

	# These are simply values which, when hashed as a blob with a newline,
	# produce a hash where the first byte is 0x17 in their respective
	# algorithms.
	test_oid_cache <<-EOF
	obj1 sha1:263
	obj1 sha256:34

	obj2 sha1:410
	obj2 sha256:174

	obj3 sha1:523
	obj3 sha256:313

	obj4 sha1:790
	obj4 sha256:481
	EOF
'
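
# Not part of the original suite: an illustrative, unused helper sketching
# how values like those in the test_oid_cache above could be brute-forced,
# i.e. small integers whose blob hash (content plus newline) starts with
# 0x17 under the repository's hash algorithm. The helper name is made up.
find_0x17_blob_values () {
	i=0
	while test $i -lt 1000
	do
		case "$(echo $i | git hash-object --stdin)" in
		17*)
			echo $i
			;;
		esac
		i=$((i + 1))
	done
}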

test_expect_success 'gc empty repository' '
	git gc
'

test_expect_success 'gc does not leave behind pid file' '
	git gc &&
	test_path_is_missing .git/gc.pid
'

test_expect_success 'gc --gobbledegook' '
	test_expect_code 129 git gc --nonsense 2>err &&
	test_i18ngrep "[Uu]sage: git gc" err
'

test_expect_success 'gc -h with invalid configuration' '
	mkdir broken &&
	(
		cd broken &&
		git init &&
		echo "[gc] pruneexpire = CORRUPT" >>.git/config &&
		test_expect_code 129 git gc -h >usage 2>&1
	) &&
	test_i18ngrep "[Uu]sage" broken/usage
'

test_expect_success 'gc is not aborted due to a stale symref' '
	git init remote &&
	(
		cd remote &&
		test_commit initial &&
		git clone . ../client &&
		git branch -m develop &&
		cd ../client &&
		git fetch --prune &&
		git gc
	)
'

test_expect_success 'gc --keep-largest-pack' '
	test_create_repo keep-pack &&
	(
		cd keep-pack &&
		test_commit one &&
		test_commit two &&
		test_commit three &&
		git gc &&
		( cd .git/objects/pack && ls *.pack ) >pack-list &&
		test_line_count = 1 pack-list &&
		cp pack-list base-pack-list &&
		test_commit four &&
		git repack -d &&
		test_commit five &&
		git repack -d &&
		( cd .git/objects/pack && ls *.pack ) >pack-list &&
		test_line_count = 3 pack-list &&
		git gc --keep-largest-pack &&
		( cd .git/objects/pack && ls *.pack ) >pack-list &&
		test_line_count = 2 pack-list &&
		awk "/^P /{print \$2}" <.git/objects/info/packs >pack-info &&
		test_line_count = 2 pack-info &&
		test_path_is_file .git/objects/pack/$(cat base-pack-list) &&
		git fsck
	)
'

test_expect_success 'pre-auto-gc hook can stop auto gc' '
	cat >err.expect <<-\EOF &&
	no gc for you
	EOF

	git init pre-auto-gc-hook &&
	test_hook -C pre-auto-gc-hook pre-auto-gc <<-\EOF &&
	echo >&2 no gc for you &&
	exit 1
	EOF
	(
		cd pre-auto-gc-hook &&

		git config gc.auto 3 &&
		git config gc.autoDetach false &&

		# We need to create two objects whose object IDs start with 17,
		# since this is what git gc counts. As it happens, these
		# two blobs will do so.
		test_commit "$(test_oid obj1)" &&
		test_commit "$(test_oid obj2)" &&

		git gc --auto >../out.actual 2>../err.actual
	) &&
	test_must_be_empty out.actual &&
	test_cmp err.expect err.actual &&

	cat >err.expect <<-\EOF &&
	will gc for you
	Auto packing the repository for optimum performance.
	See "git help gc" for manual housekeeping.
	EOF

	test_hook -C pre-auto-gc-hook --clobber pre-auto-gc <<-\EOF &&
	echo >&2 will gc for you &&
	exit 0
	EOF

	git -C pre-auto-gc-hook gc --auto >out.actual 2>err.actual &&

	test_must_be_empty out.actual &&
	test_cmp err.expect err.actual
'
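
# Illustrative only (not part of the original tests): "git gc --auto" does
# not enumerate every loose object; it samples one fan-out directory
# (.git/objects/17/) and scales the count by 256, which is why the tests
# above and below create objects whose IDs start with 17. A rough sketch of
# that estimate, under those assumptions, with a made-up helper name:
estimate_loose_objects () {
	n=$(find .git/objects/17 -type f 2>/dev/null | wc -l) &&
	echo $((n * 256))
}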

test_expect_success 'auto gc with too many loose objects does not attempt to create bitmaps' '
	test_config gc.auto 3 &&
	test_config gc.autodetach false &&
	test_config pack.writebitmaps true &&
	# We need to create two objects whose object IDs start with 17,
	# since this is what git gc counts. As it happens, these
	# two blobs will do so.
	test_commit "$(test_oid obj1)" &&
	test_commit "$(test_oid obj2)" &&
	# Our first gc will create a pack; our second will create a second pack
	git gc --auto &&
	ls .git/objects/pack/pack-*.pack | sort >existing_packs &&
	test_commit "$(test_oid obj3)" &&
	test_commit "$(test_oid obj4)" &&

	git gc --auto 2>err &&
	test_i18ngrep ! "^warning:" err &&
	ls .git/objects/pack/pack-*.pack | sort >post_packs &&
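	# comm needs sorted input (hence the "sort" above); "-1 -3" keeps only
	# lines unique to post_packs (packs the second gc added), "-2 -3" only
	# lines unique to existing_packs (packs it would have deleted).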
	comm -1 -3 existing_packs post_packs >new &&
	comm -2 -3 existing_packs post_packs >del &&
	test_line_count = 0 del && # No packs are deleted
	test_line_count = 1 new # There is one new pack
'

test_expect_success 'gc --no-quiet' '
	GIT_PROGRESS_DELAY=0 git -c gc.writeCommitGraph=true gc --no-quiet >stdout 2>stderr &&
	test_must_be_empty stdout &&
	test_i18ngrep "Computing commit graph generation numbers" stderr
'

test_expect_success TTY 'with TTY: gc --no-quiet' '
	test_terminal env GIT_PROGRESS_DELAY=0 \
		git -c gc.writeCommitGraph=true gc --no-quiet >stdout 2>stderr &&
	test_must_be_empty stdout &&
	test_i18ngrep "Enumerating objects" stderr &&
	test_i18ngrep "Computing commit graph generation numbers" stderr
'

test_expect_success 'gc --quiet' '
	git -c gc.writeCommitGraph=true gc --quiet >stdout 2>stderr &&
	test_must_be_empty stdout &&
	test_must_be_empty stderr
'

test_expect_success 'gc.reflogExpire{Unreachable,}=never skips "expire" via "gc"' '
	test_config gc.reflogExpire never &&
	test_config gc.reflogExpireUnreachable never &&

	GIT_TRACE=$(pwd)/trace.out git gc &&

	# Check that git-pack-refs is run as a sanity check (done via
	# gc_before_repack()) but that "git reflog expire" is not.
	grep -E "^trace: (built-in|exec|run_command): git pack-refs --" trace.out &&
	! grep -E "^trace: (built-in|exec|run_command): git reflog expire --" trace.out
'
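
# Note (illustrative, not from the original file): with GIT_TRACE pointed at
# trace.out, each subcommand gc spawns shows up as a line along the lines of
#
#	trace: run_command: git pack-refs --all --prune
#
# which is the shape the greps above and below rely on.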

test_expect_success 'one of gc.reflogExpire{Unreachable,}=never does not skip "expire" via "gc"' '
	>trace.out &&
	test_config gc.reflogExpire never &&
	GIT_TRACE=$(pwd)/trace.out git gc &&
	grep -E "^trace: (built-in|exec|run_command): git reflog expire --" trace.out
'

test_expect_success 'gc.repackFilter launches repack with a filter' '
	git clone --no-local --bare . bare.git &&

	git -C bare.git -c gc.cruftPacks=false gc &&
	test_stdout_line_count = 1 ls bare.git/objects/pack/*.pack &&

	GIT_TRACE=$(pwd)/trace.out git -C bare.git -c gc.repackFilter=blob:none \
		-c repack.writeBitmaps=false -c gc.cruftPacks=false gc &&
	test_stdout_line_count = 2 ls bare.git/objects/pack/*.pack &&
	grep -E "^trace: (built-in|exec|run_command): git repack .* --filter=blob:none ?.*" trace.out
'

test_expect_success 'gc.repackFilterTo stores filtered out objects' '
	test_when_finished "rm -rf bare.git filtered.git" &&

	git init --bare filtered.git &&
	git -C bare.git -c gc.repackFilter=blob:none \
		-c gc.repackFilterTo=../filtered.git/objects/pack/pack \
		-c repack.writeBitmaps=false -c gc.cruftPacks=false gc &&

	test_stdout_line_count = 1 ls bare.git/objects/pack/*.pack &&
	test_stdout_line_count = 1 ls filtered.git/objects/pack/*.pack
'
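
# A reading of the test above (added commentary): gc.repackFilterTo names
# the pack path prefix where the filtered-out objects end up (here
# ../filtered.git/objects/pack/pack, i.e. a sibling repository), so each
# repository is left holding exactly one pack.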

prepare_cruft_history () {
	test_commit base &&

	test_commit --no-tag foo &&
	test_commit --no-tag bar &&
	git reset HEAD^^
}
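
# Note (added commentary): prepare_cruft_history leaves the "base" commit
# reachable and strands the two --no-tag commits behind the reset, which is
# where the "3 reachable / 6 unreachable objects" counts in the comments
# further down come from (2 commits + 2 trees + 2 blobs go unreachable).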

assert_no_cruft_packs () {
	find .git/objects/pack -name "*.mtimes" >mtimes &&
	test_must_be_empty mtimes
}

for argv in \
	"gc" \
	"-c gc.cruftPacks=true gc" \
	"-c gc.cruftPacks=false gc --cruft"
do
	test_expect_success "git $argv generates a cruft pack" '
		test_when_finished "rm -fr repo" &&
		git init repo &&
		(
			cd repo &&

			prepare_cruft_history &&
			git $argv &&

			find .git/objects/pack -name "*.mtimes" >mtimes &&
			sed -e 's/\.mtimes$/\.pack/g' mtimes >packs &&

			test_file_not_empty packs &&
			while read pack
			do
				test_path_is_file "$pack" || return 1
			done <packs
		)
	'
done

for argv in \
	"gc --no-cruft" \
	"-c gc.cruftPacks=false gc" \
	"-c gc.cruftPacks=true gc --no-cruft"
do
	test_expect_success "git $argv does not generate a cruft pack" '
		test_when_finished "rm -fr repo" &&
		git init repo &&
		(
			cd repo &&

			prepare_cruft_history &&
			git $argv &&

			assert_no_cruft_packs
		)
	'
done

test_expect_success '--keep-largest-pack ignores cruft packs' '
	test_when_finished "rm -fr repo" &&
	git init repo &&
	(
		cd repo &&

		# Generate a pack for reachable objects (of which there
		# are 3), and one for unreachable objects (of which
		# there are 6).
		prepare_cruft_history &&
		git gc --cruft &&

		# Ensure that the cruft pack gets removed (due to
		# `--prune=now`) despite it being the largest pack.
		git gc --cruft --prune=now --keep-largest-pack &&

		assert_no_cruft_packs
	)
'

test_expect_success 'gc.bigPackThreshold ignores cruft packs' '
	test_when_finished "rm -fr repo" &&
	git init repo &&
	(
		cd repo &&

		# Generate a pack for reachable objects (of which there
		# are 3), and one for unreachable objects (of which
		# there are 6).
		prepare_cruft_history &&
		git gc --cruft &&

		mtimes="$(find .git/objects/pack -type f -name "pack-*.mtimes")" &&
		sz="$(test_file_size "${mtimes%.mtimes}.pack")" &&

		# Ensure that the cruft pack gets removed (due to
		# `--prune=now`) despite it being the largest pack.
		git -c gc.bigPackThreshold=$sz gc --cruft --prune=now &&

		assert_no_cruft_packs
	)
'

cruft_max_size_opts="git repack -d -l --cruft --cruft-expiration=2.weeks.ago"

test_expect_success 'setup for --max-cruft-size tests' '
	git init cruft--max-size &&
	(
		cd cruft--max-size &&
		prepare_cruft_history
	)
'

test_expect_success '--max-cruft-size sets appropriate repack options' '
	GIT_TRACE2_EVENT=$(pwd)/trace2.txt git -C cruft--max-size \
		gc --cruft --max-cruft-size=1M &&
	test_subcommand $cruft_max_size_opts --max-cruft-size=1048576 <trace2.txt
'

test_expect_success 'gc.maxCruftSize sets appropriate repack options' '
	GIT_TRACE2_EVENT=$(pwd)/trace2.txt \
		git -C cruft--max-size -c gc.maxCruftSize=2M gc --cruft &&
	test_subcommand $cruft_max_size_opts --max-cruft-size=2097152 <trace2.txt &&

	GIT_TRACE2_EVENT=$(pwd)/trace2.txt \
		git -C cruft--max-size -c gc.maxCruftSize=2M gc --cruft \
		--max-cruft-size=3M &&
	test_subcommand $cruft_max_size_opts --max-cruft-size=3145728 <trace2.txt
'
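
# Added commentary: the byte counts above are the binary interpretations of
# the human-readable suffixes (1M = 1048576, 2M = 2097152, 3M = 3145728
# bytes), and the second test also shows --max-cruft-size on the command
# line taking precedence over the gc.maxCruftSize configuration.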

run_and_wait_for_auto_gc () {
	# We read stdout from gc for the side effect of waiting until the
	# background gc process exits, closing its fd 9.  Furthermore, the
	# variable assignment from a command substitution preserves the
	# exit status of the main gc process.
	# Note: this fd trickery does not work on Windows, but there is no
	# need for it there, because on Windows the auto gc always runs in
	# the foreground.
	doesnt_matter=$(git gc --auto 9>&1)
}

test_expect_success 'background auto gc does not run if gc.log is present and recent but does if it is old' '
	test_commit foo &&
	test_commit bar &&
	git repack &&
	test_config gc.autopacklimit 1 &&
	test_config gc.autodetach true &&
	echo fleem >.git/gc.log &&
	git gc --auto 2>err &&
	test_i18ngrep "^warning:" err &&
	test_config gc.logexpiry 5.days &&
	test-tool chmtime =-345600 .git/gc.log &&
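	# Added note: 345600 seconds is 4 days, so gc.log still counts as
	# "recent" against the 5.days expiry above (auto gc keeps honoring
	# it) but as old once gc.logexpiry is tightened to 2.days below.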
	git gc --auto &&
	test_config gc.logexpiry 2.days &&
	run_and_wait_for_auto_gc &&
	ls .git/objects/pack/pack-*.pack >packs &&
	test_line_count = 1 packs
'

test_expect_success 'background auto gc respects lock for all operations' '
	# make sure we run a background auto-gc
	test_commit make-pack &&
	git repack &&
	test_config gc.autopacklimit 1 &&
	test_config gc.autodetach true &&

	# create a ref whose loose presence we can use to detect a pack-refs run
	git update-ref refs/heads/should-be-loose HEAD &&
	(ls -1 .git/refs/heads .git/reftable >expect || true) &&

	# now fake a concurrent gc that holds the lock; we can use our
	# shell pid so that it looks valid.
	hostname=$(hostname || echo unknown) &&
	shell_pid=$$ &&
	if test_have_prereq MINGW && test -f /proc/$shell_pid/winpid
	then
		# In Git for Windows, Bash (actually, the MSYS2 runtime) has a
		# different idea of PIDs than git.exe (actually Windows). Use
		# the Windows PID in this case.
		shell_pid=$(cat /proc/$shell_pid/winpid)
	fi &&
	printf "%d %s" "$shell_pid" "$hostname" >.git/gc.pid &&
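
	# Added note: gc.pid holds "<pid> <hostname>"; a later "gc --auto"
	# reads it back and, seeing what looks like a live gc on this host,
	# backs off rather than collecting anything itself.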

	# our gc should exit zero without doing anything
	run_and_wait_for_auto_gc &&
	(ls -1 .git/refs/heads .git/reftable >actual || true) &&
	test_cmp expect actual
'

# DO NOT leave a detached auto gc process running near the end of the
# test script: it can run long enough in the background to racily
# interfere with the cleanup in 'test_done'.

test_done