2018-07-12 19:39:23 +00:00
|
|
|
#include "test-tool.h"
|
2023-02-24 00:09:27 +00:00
|
|
|
#include "hex.h"
|
2018-07-12 19:39:23 +00:00
|
|
|
#include "midx.h"
|
|
|
|
#include "repository.h"
|
2023-05-16 06:34:06 +00:00
|
|
|
#include "object-store-ll.h"
|
2021-09-29 01:55:20 +00:00
|
|
|
#include "pack-bitmap.h"
|
2023-04-22 20:17:10 +00:00
|
|
|
#include "packfile.h"
|
2023-03-21 06:26:05 +00:00
|
|
|
#include "setup.h"
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
#include "gettext.h"
|
2018-07-12 19:39:23 +00:00
|
|
|
|
2021-03-30 15:04:07 +00:00
|
|
|
static int read_midx_file(const char *object_dir, int show_objects)
|
2018-07-12 19:39:23 +00:00
|
|
|
{
|
2018-07-12 19:39:28 +00:00
|
|
|
uint32_t i;
|
2020-08-17 14:04:48 +00:00
|
|
|
struct multi_pack_index *m;
|
|
|
|
|
|
|
|
setup_git_directory();
|
|
|
|
m = load_multi_pack_index(object_dir, 1);
|
2018-07-12 19:39:23 +00:00
|
|
|
|
|
|
|
if (!m)
|
|
|
|
return 1;
|
|
|
|
|
2020-08-17 14:04:48 +00:00
|
|
|
printf("header: %08x %d %d %d %d\n",
|
2018-07-12 19:39:23 +00:00
|
|
|
m->signature,
|
|
|
|
m->version,
|
2020-08-17 14:04:48 +00:00
|
|
|
m->hash_len,
|
2018-07-12 19:39:23 +00:00
|
|
|
m->num_chunks,
|
|
|
|
m->num_packs);
|
|
|
|
|
2018-07-12 19:39:27 +00:00
|
|
|
printf("chunks:");
|
|
|
|
|
|
|
|
if (m->chunk_pack_names)
|
|
|
|
printf(" pack-names");
|
2018-07-12 19:39:31 +00:00
|
|
|
if (m->chunk_oid_fanout)
|
|
|
|
printf(" oid-fanout");
|
2018-07-12 19:39:30 +00:00
|
|
|
if (m->chunk_oid_lookup)
|
|
|
|
printf(" oid-lookup");
|
2018-07-12 19:39:32 +00:00
|
|
|
if (m->chunk_object_offsets)
|
|
|
|
printf(" object-offsets");
|
|
|
|
if (m->chunk_large_offsets)
|
|
|
|
printf(" large-offsets");
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2018-07-12 19:39:31 +00:00
|
|
|
printf("\nnum_objects: %d\n", m->num_objects);
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2018-07-12 19:39:28 +00:00
|
|
|
printf("packs:\n");
|
|
|
|
for (i = 0; i < m->num_packs; i++)
|
|
|
|
printf("%s\n", m->pack_names[i]);
|
|
|
|
|
2018-07-12 19:39:23 +00:00
|
|
|
printf("object-dir: %s\n", m->object_dir);
|
|
|
|
|
2021-03-30 15:04:07 +00:00
|
|
|
if (show_objects) {
|
|
|
|
struct object_id oid;
|
|
|
|
struct pack_entry e;
|
|
|
|
|
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
nth_midxed_object_oid(&oid, m, i);
|
|
|
|
fill_midx_entry(the_repository, &oid, &e, m);
|
|
|
|
|
|
|
|
printf("%s %"PRIu64"\t%s\n",
|
|
|
|
oid_to_hex(&oid), e.offset, e.p->pack_name);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-26 21:01:11 +00:00
|
|
|
close_midx(m);
|
|
|
|
|
2018-07-12 19:39:23 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-08-31 20:52:28 +00:00
|
|
|
/*
 * Print the checksum of the multi-pack-index loaded from `object_dir`
 * as a hex string followed by a newline.
 *
 * Returns 0 on success, or 1 if the MIDX cannot be loaded.
 */
static int read_midx_checksum(const char *object_dir)
{
	struct multi_pack_index *m;

	setup_git_directory();
	m = load_multi_pack_index(object_dir, 1);
	if (!m)
		return 1;

	printf("%s\n", hash_to_hex(get_midx_checksum(m)));

	/*
	 * Release the MIDX only after the checksum has been printed, as
	 * the checksum is obtained from the loaded MIDX itself.
	 */
	close_midx(m);

	return 0;
}
|
|
|
|
|
2021-09-29 01:55:20 +00:00
|
|
|
static int read_midx_preferred_pack(const char *object_dir)
|
|
|
|
{
|
|
|
|
struct multi_pack_index *midx = NULL;
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
uint32_t preferred_pack;
|
2021-09-29 01:55:20 +00:00
|
|
|
|
|
|
|
setup_git_directory();
|
|
|
|
|
|
|
|
midx = load_multi_pack_index(object_dir, 1);
|
|
|
|
if (!midx)
|
|
|
|
return 1;
|
|
|
|
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
if (midx_preferred_pack(midx, &preferred_pack) < 0) {
|
|
|
|
warning(_("could not determine MIDX preferred pack"));
|
2021-10-07 02:24:40 +00:00
|
|
|
return 1;
|
|
|
|
}
|
2021-09-29 01:55:20 +00:00
|
|
|
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
printf("%s\n", midx->pack_names[preferred_pack]);
|
2021-09-29 01:55:20 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the number of objects that are
reused verbatim (and consequently, decreases the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
static int read_midx_bitmapped_packs(const char *object_dir)
|
|
|
|
{
|
|
|
|
struct multi_pack_index *midx = NULL;
|
|
|
|
struct bitmapped_pack pack;
|
|
|
|
uint32_t i;
|
|
|
|
|
|
|
|
setup_git_directory();
|
|
|
|
|
|
|
|
midx = load_multi_pack_index(object_dir, 1);
|
|
|
|
if (!midx)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
for (i = 0; i < midx->num_packs; i++) {
|
|
|
|
if (nth_bitmapped_pack(the_repository, midx, &pack, i) < 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
printf("%s\n", pack_basename(pack.p));
|
|
|
|
printf(" bitmap_pos: %"PRIuMAX"\n", (uintmax_t)pack.bitmap_pos);
|
|
|
|
printf(" bitmap_nr: %"PRIuMAX"\n", (uintmax_t)pack.bitmap_nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
close_midx(midx);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-12 19:39:23 +00:00
|
|
|
/*
 * Entry point of the `read-midx` test helper.
 *
 * Accepted forms:
 *
 *   read-midx <object-dir>
 *   read-midx (--show-objects|--checksum|--preferred-pack|--bitmap) <object-dir>
 *
 * An option is only honored in the three-argument form, so that the
 * object directory handed to the helpers is never the NULL terminator
 * of argv. Any other argument count, or an unrecognized option, results
 * in the usage message.
 *
 * Returns the status of the selected helper.
 */
int cmd__read_midx(int argc, const char **argv)
{
	static const char read_midx_usage[] =
		"read-midx [--show-objects|--checksum|--preferred-pack|--bitmap] <object-dir>";

	if (argc == 3) {
		if (!strcmp(argv[1], "--show-objects"))
			return read_midx_file(argv[2], 1);
		else if (!strcmp(argv[1], "--checksum"))
			return read_midx_checksum(argv[2]);
		else if (!strcmp(argv[1], "--preferred-pack"))
			return read_midx_preferred_pack(argv[2]);
		else if (!strcmp(argv[1], "--bitmap"))
			return read_midx_bitmapped_packs(argv[2]);

		/*
		 * Do not silently treat an unknown option as the object
		 * directory while ignoring the real one.
		 */
		usage(read_midx_usage);
	}

	if (argc != 2)
		usage(read_midx_usage);

	return read_midx_file(argv[1], 0);
}
|