mirror of
https://github.com/git/git
synced 2024-11-05 18:59:29 +00:00
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by their modified time. Second, walk those pack-files from oldest to newest, compute their expected size, and add the packs to a list if they are smaller than the given batch-size. Stop when the total expected size is at least the batch size. If the batch size is zero, select all packs in the multi-pack-index. Finally, collect the objects from the multi-pack-index that are in the selected packs and send them to 'git pack-objects'. Write a new multi-pack-index that includes the new pack. Using a batch size of zero is very similar to a standard 'git repack' command, except that we do not delete the old packs and instead rely on the new multi-pack-index to prevent new processes from reading the old packs. This does not disrupt other Git processes that are currently reading the old packs based on the old multi-pack-index. While first designing a 'git multi-pack-index repack' operation, I started by collecting the batches based on the actual size of the objects instead of the size of the pack-files. This allows repacking a large pack-file that has very few referenced objects. However, this came at a significant cost of parsing pack-files instead of simply reading the multi-pack-index and getting the file information for the pack-files. The "expected size" version provides similar behavior, but could skip a pack-file if the average object size is much larger than the actual size of the referenced objects, or can create a large pack if the actual size of the referenced objects is larger than the expected size. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
parent
2af890bb28
commit
ce1e4a105b
2 changed files with 178 additions and 1 deletions
151
midx.c
151
midx.c
|
@ -9,6 +9,7 @@
|
||||||
#include "midx.h"
|
#include "midx.h"
|
||||||
#include "progress.h"
|
#include "progress.h"
|
||||||
#include "trace2.h"
|
#include "trace2.h"
|
||||||
|
#include "run-command.h"
|
||||||
|
|
||||||
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
|
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
|
||||||
#define MIDX_VERSION 1
|
#define MIDX_VERSION 1
|
||||||
|
@ -1227,7 +1228,155 @@ int expire_midx_packs(struct repository *r, const char *object_dir)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
|
struct repack_info {
|
||||||
|
timestamp_t mtime;
|
||||||
|
uint32_t referenced_objects;
|
||||||
|
uint32_t pack_int_id;
|
||||||
|
};
|
||||||
|
|
||||||
|
static int compare_by_mtime(const void *a_, const void *b_)
|
||||||
{
|
{
|
||||||
|
const struct repack_info *a, *b;
|
||||||
|
|
||||||
|
a = (const struct repack_info *)a_;
|
||||||
|
b = (const struct repack_info *)b_;
|
||||||
|
|
||||||
|
if (a->mtime < b->mtime)
|
||||||
|
return -1;
|
||||||
|
if (a->mtime > b->mtime)
|
||||||
|
return 1;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int fill_included_packs_all(struct multi_pack_index *m,
|
||||||
|
unsigned char *include_pack)
|
||||||
|
{
|
||||||
|
uint32_t i;
|
||||||
|
|
||||||
|
for (i = 0; i < m->num_packs; i++)
|
||||||
|
include_pack[i] = 1;
|
||||||
|
|
||||||
|
return m->num_packs < 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int fill_included_packs_batch(struct repository *r,
|
||||||
|
struct multi_pack_index *m,
|
||||||
|
unsigned char *include_pack,
|
||||||
|
size_t batch_size)
|
||||||
|
{
|
||||||
|
uint32_t i, packs_to_repack;
|
||||||
|
size_t total_size;
|
||||||
|
struct repack_info *pack_info = xcalloc(m->num_packs, sizeof(struct repack_info));
|
||||||
|
|
||||||
|
for (i = 0; i < m->num_packs; i++) {
|
||||||
|
pack_info[i].pack_int_id = i;
|
||||||
|
|
||||||
|
if (prepare_midx_pack(r, m, i))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
pack_info[i].mtime = m->packs[i]->mtime;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; batch_size && i < m->num_objects; i++) {
|
||||||
|
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
|
||||||
|
pack_info[pack_int_id].referenced_objects++;
|
||||||
|
}
|
||||||
|
|
||||||
|
QSORT(pack_info, m->num_packs, compare_by_mtime);
|
||||||
|
|
||||||
|
total_size = 0;
|
||||||
|
packs_to_repack = 0;
|
||||||
|
for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
|
||||||
|
int pack_int_id = pack_info[i].pack_int_id;
|
||||||
|
struct packed_git *p = m->packs[pack_int_id];
|
||||||
|
size_t expected_size;
|
||||||
|
|
||||||
|
if (!p)
|
||||||
|
continue;
|
||||||
|
if (open_pack_index(p) || !p->num_objects)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
expected_size = (size_t)(p->pack_size
|
||||||
|
* pack_info[i].referenced_objects);
|
||||||
|
expected_size /= p->num_objects;
|
||||||
|
|
||||||
|
if (expected_size >= batch_size)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
packs_to_repack++;
|
||||||
|
total_size += expected_size;
|
||||||
|
include_pack[pack_int_id] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(pack_info);
|
||||||
|
|
||||||
|
if (total_size < batch_size || packs_to_repack < 2)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
|
||||||
|
{
|
||||||
|
int result = 0;
|
||||||
|
uint32_t i;
|
||||||
|
unsigned char *include_pack;
|
||||||
|
struct child_process cmd = CHILD_PROCESS_INIT;
|
||||||
|
struct strbuf base_name = STRBUF_INIT;
|
||||||
|
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
|
||||||
|
|
||||||
|
if (!m)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
include_pack = xcalloc(m->num_packs, sizeof(unsigned char));
|
||||||
|
|
||||||
|
if (batch_size) {
|
||||||
|
if (fill_included_packs_batch(r, m, include_pack, batch_size))
|
||||||
|
goto cleanup;
|
||||||
|
} else if (fill_included_packs_all(m, include_pack))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
argv_array_push(&cmd.args, "pack-objects");
|
||||||
|
|
||||||
|
strbuf_addstr(&base_name, object_dir);
|
||||||
|
strbuf_addstr(&base_name, "/pack/pack");
|
||||||
|
argv_array_push(&cmd.args, base_name.buf);
|
||||||
|
strbuf_release(&base_name);
|
||||||
|
|
||||||
|
cmd.git_cmd = 1;
|
||||||
|
cmd.in = cmd.out = -1;
|
||||||
|
|
||||||
|
if (start_command(&cmd)) {
|
||||||
|
error(_("could not start pack-objects"));
|
||||||
|
result = 1;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < m->num_objects; i++) {
|
||||||
|
struct object_id oid;
|
||||||
|
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
|
||||||
|
|
||||||
|
if (!include_pack[pack_int_id])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
nth_midxed_object_oid(&oid, m, i);
|
||||||
|
xwrite(cmd.in, oid_to_hex(&oid), the_hash_algo->hexsz);
|
||||||
|
xwrite(cmd.in, "\n", 1);
|
||||||
|
}
|
||||||
|
close(cmd.in);
|
||||||
|
|
||||||
|
if (finish_command(&cmd)) {
|
||||||
|
error(_("could not finish pack-objects"));
|
||||||
|
result = 1;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
result = write_midx_internal(object_dir, m, NULL);
|
||||||
|
m = NULL;
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if (m)
|
||||||
|
close_midx(m);
|
||||||
|
free(include_pack);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
|
@ -450,4 +450,32 @@ test_expect_success 'repack with minimum size does not alter existing packs' '
|
||||||
)
|
)
|
||||||
'
|
'
|
||||||
|
|
||||||
|
# Repack with a batch size one byte above the third-smallest pack size and
# check that exactly one new pack appears, both on disk and in the
# multi-pack-index. Pack sizes are read with "wc -c" rather than parsed out
# of "ls -l", whose size column moves when owner or group names contain
# spaces.
test_expect_success 'repack creates a new pack' '
	(
		cd dup &&
		ls .git/objects/pack/*idx >idx-list &&
		test_line_count = 5 idx-list &&
		THIRD_SMALLEST_SIZE=$(for pack in .git/objects/pack/*pack; do wc -c <"$pack"; done | sort -n | head -n 3 | tail -n 1) &&
		BATCH_SIZE=$(($THIRD_SMALLEST_SIZE + 1)) &&
		git multi-pack-index repack --batch-size=$BATCH_SIZE &&
		ls .git/objects/pack/*idx >idx-list &&
		test_line_count = 6 idx-list &&
		test-tool read-midx .git/objects | grep idx >midx-list &&
		test_line_count = 6 midx-list
	)
'
|
||||||
|
|
||||||
|
# After "multi-pack-index expire", only the four largest packs (as listed by
# "ls -S" beforehand) are expected to remain on disk, and the
# multi-pack-index is expected to list four packs.
test_expect_success 'expire removes repacked packs' '
	(
		cd dup &&
		ls -al .git/objects/pack/*pack &&
		ls -S .git/objects/pack/*pack | sed 4q >expect &&
		git multi-pack-index expire &&
		ls -S .git/objects/pack/*pack >actual &&
		test_cmp expect actual &&
		test-tool read-midx .git/objects | grep idx >midx-list &&
		test_line_count = 4 midx-list
	)
'
|
||||||
|
|
||||||
test_done
|
test_done
|
||||||
|
|
Loading…
Reference in a new issue