2023-04-22 20:17:23 +00:00
|
|
|
#include "git-compat-util.h"
|
2018-07-12 19:39:33 +00:00
|
|
|
#include "config.h"
|
2018-07-12 19:39:26 +00:00
|
|
|
#include "dir.h"
|
2023-02-24 00:09:27 +00:00
|
|
|
#include "hex.h"
|
2018-07-12 19:39:26 +00:00
|
|
|
#include "packfile.h"
|
2023-04-11 07:41:53 +00:00
|
|
|
#include "object-file.h"
|
2020-12-31 11:56:23 +00:00
|
|
|
#include "hash-lookup.h"
|
2018-07-12 19:39:21 +00:00
|
|
|
#include "midx.h"
|
2018-09-13 18:02:26 +00:00
|
|
|
#include "progress.h"
|
2019-03-21 19:36:13 +00:00
|
|
|
#include "trace2.h"
|
2021-02-18 14:07:33 +00:00
|
|
|
#include "chunk-format.h"
|
2021-08-31 20:52:24 +00:00
|
|
|
#include "pack-bitmap.h"
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
#include "pack-revindex.h"
|
2018-07-12 19:39:21 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/*
 * Forward declarations. The matching definitions are not visible in
 * this part of the file, so their exact contracts should be checked
 * against those definitions.
 */
int midx_checksum_valid(struct multi_pack_index *m);
void clear_midx_files_ext(const char *object_dir, const char *ext,
			  unsigned char *keep_hash);
int cmp_idx_or_pack_name(const char *idx_or_pack_name,
			 const char *idx_name);
|
2019-06-10 23:35:25 +00:00
|
|
|
|
2021-08-31 20:52:21 +00:00
|
|
|
const unsigned char *get_midx_checksum(struct multi_pack_index *m)
|
pack-revindex: read multi-pack reverse indexes
Implement reading for multi-pack reverse indexes, as described in the
previous patch.
Note that these functions don't yet have any callers, and won't until
multi-pack reachability bitmaps are introduced in a later patch series.
In the meantime, this patch implements some of the infrastructure
necessary to support multi-pack bitmaps.
There are three new functions exposed by the revindex API:
- load_midx_revindex(): loads the reverse index corresponding to the
given multi-pack index.
- midx_to_pack_pos() and pack_pos_to_midx(): these convert between the
multi-pack index and pseudo-pack order.
load_midx_revindex() and pack_pos_to_midx() are both relatively
straightforward.
load_midx_revindex() needs a few functions to be exposed from the midx
API. One to get the checksum of a midx, and another to get the .rev's
filename. Similar to recent changes in the packed_git struct, three new
fields are added to the multi_pack_index struct: one to keep track of
the size, one to keep track of the mmap'd pointer, and another to point
past the header and at the reverse index's data.
pack_pos_to_midx() simply reads the corresponding entry out of the
table.
midx_to_pack_pos() is the trickiest, since it needs to find an object's
position in the psuedo-pack order, but that order can only be recovered
in the .rev file itself. This mapping can be implemented with a binary
search, but note that the thing we're binary searching over isn't an
array of values, but rather a permuted order of those values.
So, when comparing two items, it's helpful to keep in mind the
difference. Instead of a traditional binary search, where you are
comparing two things directly, here we're comparing a (pack, offset)
tuple with an index into the multi-pack index. That index describes
another (pack, offset) tuple, and it is _those_ two tuples that are
compared.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 15:04:26 +00:00
|
|
|
{
|
|
|
|
return m->data + m->data_len - the_hash_algo->rawsz;
|
|
|
|
}
|
|
|
|
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
/*
 * Add the path of the multi-pack-index file belonging to `object_dir`
 * (that is, "<object_dir>/pack/multi-pack-index") to the strbuf `out`.
 *
 * The result is written through `out` rather than returned, so the
 * caller owns the buffer and is responsible for releasing it.
 */
void get_midx_filename(struct strbuf *out, const char *object_dir)
{
	strbuf_addf(out, "%s/pack/multi-pack-index", object_dir);
}
|
|
|
|
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
void get_midx_rev_filename(struct strbuf *out, struct multi_pack_index *m)
|
pack-revindex: read multi-pack reverse indexes
Implement reading for multi-pack reverse indexes, as described in the
previous patch.
Note that these functions don't yet have any callers, and won't until
multi-pack reachability bitmaps are introduced in a later patch series.
In the meantime, this patch implements some of the infrastructure
necessary to support multi-pack bitmaps.
There are three new functions exposed by the revindex API:
- load_midx_revindex(): loads the reverse index corresponding to the
given multi-pack index.
- midx_to_pack_pos() and pack_pos_to_midx(): these convert between the
multi-pack index and pseudo-pack order.
load_midx_revindex() and pack_pos_to_midx() are both relatively
straightforward.
load_midx_revindex() needs a few functions to be exposed from the midx
API. One to get the checksum of a midx, and another to get the .rev's
filename. Similar to recent changes in the packed_git struct, three new
fields are added to the multi_pack_index struct: one to keep track of
the size, one to keep track of the mmap'd pointer, and another to point
past the header and at the reverse index's data.
pack_pos_to_midx() simply reads the corresponding entry out of the
table.
midx_to_pack_pos() is the trickiest, since it needs to find an object's
position in the psuedo-pack order, but that order can only be recovered
in the .rev file itself. This mapping can be implemented with a binary
search, but note that the thing we're binary searching over isn't an
array of values, but rather a permuted order of those values.
So, when comparing two items, it's helpful to keep in mind the
difference. Instead of a traditional binary search, where you are
comparing two things directly, here we're comparing a (pack, offset)
tuple with an index into the multi-pack index. That index describes
another (pack, offset) tuple, and it is _those_ two tuples that are
compared.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 15:04:26 +00:00
|
|
|
{
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
get_midx_filename(out, m->object_dir);
|
|
|
|
strbuf_addf(out, "-%s.rev", hash_to_hex(get_midx_checksum(m)));
|
pack-revindex: read multi-pack reverse indexes
Implement reading for multi-pack reverse indexes, as described in the
previous patch.
Note that these functions don't yet have any callers, and won't until
multi-pack reachability bitmaps are introduced in a later patch series.
In the meantime, this patch implements some of the infrastructure
necessary to support multi-pack bitmaps.
There are three new functions exposed by the revindex API:
- load_midx_revindex(): loads the reverse index corresponding to the
given multi-pack index.
- midx_to_pack_pos() and pack_pos_to_midx(): these convert between the
multi-pack index and pseudo-pack order.
load_midx_revindex() and pack_pos_to_midx() are both relatively
straightforward.
load_midx_revindex() needs a few functions to be exposed from the midx
API. One to get the checksum of a midx, and another to get the .rev's
filename. Similar to recent changes in the packed_git struct, three new
fields are added to the multi_pack_index struct: one to keep track of
the size, one to keep track of the mmap'd pointer, and another to point
past the header and at the reverse index's data.
pack_pos_to_midx() simply reads the corresponding entry out of the
table.
midx_to_pack_pos() is the trickiest, since it needs to find an object's
position in the psuedo-pack order, but that order can only be recovered
in the .rev file itself. This mapping can be implemented with a binary
search, but note that the thing we're binary searching over isn't an
array of values, but rather a permuted order of those values.
So, when comparing two items, it's helpful to keep in mind the
difference. Instead of a traditional binary search, where you are
comparing two things directly, here we're comparing a (pack, offset)
tuple with an index into the multi-pack index. That index describes
another (pack, offset) tuple, and it is _those_ two tuples that are
compared.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 15:04:26 +00:00
|
|
|
}
|
|
|
|
|
2021-02-18 14:07:36 +00:00
|
|
|
static int midx_read_oid_fanout(const unsigned char *chunk_start,
|
|
|
|
size_t chunk_size, void *data)
|
|
|
|
{
|
midx: check consistency of fanout table
The commit-graph, midx, and pack idx on-disk formats all have oid fanout
tables which are fed to bsearch_hash(). If these tables do not increase
monotonically, then the binary search may not only produce bogus values,
it may cause out of bounds reads.
We fixed this for commit graphs in 4169d89645 (commit-graph: check
consistency of fanout table, 2023-10-09). That commit argued that we did
not need to do the same for midx and pack idx files, because they
already did this check. However, that is wrong. We _do_ check the fanout
table for pack idx files when we load them, but we only do so for midx
files when running "git multi-pack-index verify". So it is possible to
get an out-of-bounds read by running a normal command with a specially
crafted midx file.
Let's fix this using the same solution (and roughly the same test) we
did for the commit-graph in 4169d89645. This replaces the same check
from "multi-pack-index verify", because verify uses the same read
routines, we'd bail on reading the midx much sooner now. So let's make
sure to copy its verbose error message.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-11-09 07:12:07 +00:00
|
|
|
int i;
|
2021-02-18 14:07:36 +00:00
|
|
|
struct multi_pack_index *m = data;
|
|
|
|
m->chunk_oid_fanout = (uint32_t *)chunk_start;
|
|
|
|
|
|
|
|
if (chunk_size != 4 * 256) {
|
|
|
|
error(_("multi-pack-index OID fanout is of the wrong size"));
|
|
|
|
return 1;
|
|
|
|
}
|
midx: check consistency of fanout table
The commit-graph, midx, and pack idx on-disk formats all have oid fanout
tables which are fed to bsearch_hash(). If these tables do not increase
monotonically, then the binary search may not only produce bogus values,
it may cause out of bounds reads.
We fixed this for commit graphs in 4169d89645 (commit-graph: check
consistency of fanout table, 2023-10-09). That commit argued that we did
not need to do the same for midx and pack idx files, because they
already did this check. However, that is wrong. We _do_ check the fanout
table for pack idx files when we load them, but we only do so for midx
files when running "git multi-pack-index verify". So it is possible to
get an out-of-bounds read by running a normal command with a specially
crafted midx file.
Let's fix this using the same solution (and roughly the same test) we
did for the commit-graph in 4169d89645. This replaces the same check
from "multi-pack-index verify", because verify uses the same read
routines, we'd bail on reading the midx much sooner now. So let's make
sure to copy its verbose error message.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-11-09 07:12:07 +00:00
|
|
|
for (i = 0; i < 255; i++) {
|
|
|
|
uint32_t oid_fanout1 = ntohl(m->chunk_oid_fanout[i]);
|
|
|
|
uint32_t oid_fanout2 = ntohl(m->chunk_oid_fanout[i+1]);
|
|
|
|
|
|
|
|
if (oid_fanout1 > oid_fanout2) {
|
|
|
|
error(_("oid fanout out of order: fanout[%d] = %"PRIx32" > %"PRIx32" = fanout[%d]"),
|
|
|
|
i, oid_fanout1, oid_fanout2, i + 1);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
2023-10-09 21:02:03 +00:00
|
|
|
m->num_objects = ntohl(m->chunk_oid_fanout[255]);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int midx_read_oid_lookup(const unsigned char *chunk_start,
|
|
|
|
size_t chunk_size, void *data)
|
|
|
|
{
|
|
|
|
struct multi_pack_index *m = data;
|
|
|
|
m->chunk_oid_lookup = chunk_start;
|
|
|
|
|
|
|
|
if (chunk_size != st_mult(m->hash_len, m->num_objects)) {
|
|
|
|
error(_("multi-pack-index OID lookup chunk is the wrong size"));
|
|
|
|
return 1;
|
|
|
|
}
|
2021-02-18 14:07:36 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-10-09 21:05:27 +00:00
|
|
|
static int midx_read_object_offsets(const unsigned char *chunk_start,
|
|
|
|
size_t chunk_size, void *data)
|
|
|
|
{
|
|
|
|
struct multi_pack_index *m = data;
|
|
|
|
m->chunk_object_offsets = chunk_start;
|
|
|
|
|
|
|
|
if (chunk_size != st_mult(m->num_objects, MIDX_CHUNK_OFFSET_WIDTH)) {
|
|
|
|
error(_("multi-pack-index object offset chunk is the wrong size"));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/*
 * Smallest file size accepted for a multi-pack-index: the fixed header
 * plus one hash of the current algorithm's raw size. Not a compile-time
 * constant, since it reads the_hash_algo when evaluated.
 */
#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz)
|
|
|
|
|
2018-08-20 16:51:55 +00:00
|
|
|
struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local)
|
2018-07-12 19:39:23 +00:00
|
|
|
{
|
|
|
|
struct multi_pack_index *m = NULL;
|
|
|
|
int fd;
|
|
|
|
struct stat st;
|
|
|
|
size_t midx_size;
|
|
|
|
void *midx_map = NULL;
|
|
|
|
uint32_t hash_version;
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
struct strbuf midx_name = STRBUF_INIT;
|
2018-07-12 19:39:27 +00:00
|
|
|
uint32_t i;
|
2018-07-12 19:39:28 +00:00
|
|
|
const char *cur_pack_name;
|
2021-02-18 14:07:36 +00:00
|
|
|
struct chunkfile *cf = NULL;
|
2018-07-12 19:39:23 +00:00
|
|
|
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
get_midx_filename(&midx_name, object_dir);
|
|
|
|
|
|
|
|
fd = git_open(midx_name.buf);
|
2018-07-12 19:39:23 +00:00
|
|
|
|
|
|
|
if (fd < 0)
|
|
|
|
goto cleanup_fail;
|
|
|
|
if (fstat(fd, &st)) {
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
error_errno(_("failed to read %s"), midx_name.buf);
|
2018-07-12 19:39:23 +00:00
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
midx_size = xsize_t(st.st_size);
|
|
|
|
|
|
|
|
if (midx_size < MIDX_MIN_SIZE) {
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
error(_("multi-pack-index file %s is too small"), midx_name.buf);
|
2018-07-12 19:39:23 +00:00
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
|
|
|
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
strbuf_release(&midx_name);
|
2018-07-12 19:39:23 +00:00
|
|
|
|
|
|
|
midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
2020-04-24 13:17:16 +00:00
|
|
|
close(fd);
|
2018-07-12 19:39:23 +00:00
|
|
|
|
2019-04-03 22:00:05 +00:00
|
|
|
FLEX_ALLOC_STR(m, object_dir, object_dir);
|
2018-07-12 19:39:23 +00:00
|
|
|
m->data = midx_map;
|
|
|
|
m->data_len = midx_size;
|
2018-08-20 16:51:55 +00:00
|
|
|
m->local = local;
|
2018-07-12 19:39:23 +00:00
|
|
|
|
|
|
|
m->signature = get_be32(m->data);
|
2018-09-13 18:02:15 +00:00
|
|
|
if (m->signature != MIDX_SIGNATURE)
|
|
|
|
die(_("multi-pack-index signature 0x%08x does not match signature 0x%08x"),
|
2018-07-12 19:39:23 +00:00
|
|
|
m->signature, MIDX_SIGNATURE);
|
|
|
|
|
|
|
|
m->version = m->data[MIDX_BYTE_FILE_VERSION];
|
2018-09-13 18:02:15 +00:00
|
|
|
if (m->version != MIDX_VERSION)
|
|
|
|
die(_("multi-pack-index version %d not recognized"),
|
2018-07-12 19:39:23 +00:00
|
|
|
m->version);
|
|
|
|
|
|
|
|
hash_version = m->data[MIDX_BYTE_HASH_VERSION];
|
2022-05-20 23:17:41 +00:00
|
|
|
if (hash_version != oid_version(the_hash_algo)) {
|
2020-08-17 14:04:48 +00:00
|
|
|
error(_("multi-pack-index hash version %u does not match version %u"),
|
2022-05-20 23:17:41 +00:00
|
|
|
hash_version, oid_version(the_hash_algo));
|
2020-08-17 14:04:48 +00:00
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
2019-08-18 20:04:27 +00:00
|
|
|
m->hash_len = the_hash_algo->rawsz;
|
2018-07-12 19:39:23 +00:00
|
|
|
|
|
|
|
m->num_chunks = m->data[MIDX_BYTE_NUM_CHUNKS];
|
|
|
|
|
|
|
|
m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS);
|
|
|
|
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
m->preferred_pack_idx = -1;
|
|
|
|
|
2021-02-18 14:07:36 +00:00
|
|
|
cf = init_chunkfile(NULL);
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2021-02-18 14:07:36 +00:00
|
|
|
if (read_table_of_contents(cf, m->data, midx_size,
|
midx: enforce chunk alignment on reading
The midx reader assumes chunks are aligned to a 4-byte boundary: we
treat the fanout chunk as an array of uint32_t, indexing it to feed the
results to ntohl(). Without aligning the chunks, we may violate the
CPU's alignment constraints. Though many platforms allow this, some do
not. And certainly UBSan will complain, since it is undefined behavior.
Even though most chunks are naturally 4-byte-aligned (because they are
storing uint32_t or larger types), PNAM is not. It stores NUL-terminated
pack names, so you can have a valid chunk with any length. The writing
side handles this by 4-byte-aligning the chunk, introducing a few extra
NULs as necessary. But since we don't check this on the reading side, we
may end up with a misaligned fanout and trigger the undefined behavior.
We have two options here:
1. Swap out ntohl(fanout[i]) for get_be32(fanout+i) everywhere. The
latter handles alignment itself. It's possible that it's slightly
slower (though in practice I'm not sure how true that is,
especially for these code paths which then go on to do a binary
search).
2. Enforce the alignment when reading the chunks. This is easy to do,
since the table-of-contents reader can check it in one spot.
I went with the second option here, just because it places less burden
on maintenance going forward (it is OK to continue using ntohl), and we
know it can't have any performance impact on the actual reads.
The commit-graph code uses the same chunk API. It's usually also 4-byte
aligned, but some chunks are not (like Bloom filter BDAT chunks). So
we'll pass "1" here to allow any alignment. It doesn't suffer from the
same problem as midx with its fanout because the fanout chunk is always
the first (and the rest of the format dictates that the first chunk will
start aligned).
The new test shows the effect on a midx with a misaligned PNAM chunk.
Note that the midx-reading code treats chunk-toc errors as soft, falling
back to the non-midx path rather than calling die(), as we do for other
parsing errors. Arguably we should make all of these behave the same,
but that's out of scope for this patch. For now the test just expects
the fallback behavior.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 21:05:23 +00:00
|
|
|
MIDX_HEADER_SIZE, m->num_chunks,
|
|
|
|
MIDX_CHUNK_ALIGNMENT))
|
2021-02-18 14:07:36 +00:00
|
|
|
goto cleanup_fail;
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2023-10-09 21:05:14 +00:00
|
|
|
if (pair_chunk(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names, &m->chunk_pack_names_len))
|
2023-10-09 20:59:19 +00:00
|
|
|
die(_("multi-pack-index required pack-name chunk missing or corrupted"));
|
|
|
|
if (read_chunk(cf, MIDX_CHUNKID_OIDFANOUT, midx_read_oid_fanout, m))
|
|
|
|
die(_("multi-pack-index required OID fanout chunk missing or corrupted"));
|
2023-10-09 21:02:03 +00:00
|
|
|
if (read_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, midx_read_oid_lookup, m))
|
2023-10-09 20:59:19 +00:00
|
|
|
die(_("multi-pack-index required OID lookup chunk missing or corrupted"));
|
2023-10-09 21:05:27 +00:00
|
|
|
if (read_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, midx_read_object_offsets, m))
|
2023-10-09 20:59:19 +00:00
|
|
|
die(_("multi-pack-index required object offsets chunk missing or corrupted"));
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2023-10-09 21:05:30 +00:00
|
|
|
pair_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets,
|
|
|
|
&m->chunk_large_offsets_len);
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decrease the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
pair_chunk(cf, MIDX_CHUNKID_BITMAPPEDPACKS,
|
|
|
|
(const unsigned char **)&m->chunk_bitmapped_packs,
|
|
|
|
&m->chunk_bitmapped_packs_len);
|
2021-02-18 14:07:36 +00:00
|
|
|
|
midx: read `RIDX` chunk when present
When a MIDX contains the new `RIDX` chunk, ensure that the reverse index
is read from it instead of the on-disk .rev file. Since we need to
encode the object order in the MIDX itself for correctness reasons,
there is no point in storing the same data again outside of the MIDX.
So, this patch stops writing separate .rev files, and reads it out of
the MIDX itself. This is possible to do with relatively little new code,
since the format of the RIDX chunk is identical to the data in the .rev
file. In other words, we can implement this by pointing the
`revindex_data` field at the reverse index chunk of the MIDX instead of
the .rev file without any other changes.
Note that we have two knobs that are adjusted for the new tests:
GIT_TEST_MIDX_WRITE_REV and GIT_TEST_MIDX_READ_RIDX. The former controls
whether the MIDX .rev is written at all, and the latter controls whether
we read the MIDX's RIDX chunk.
Both are necessary to ensure that the test added at the beginning of
this series continues to work. This is because we always need to write
the RIDX chunk in the MIDX in order to change its checksum, but we want
to make sure reading the existing .rev file still works (since the RIDX
chunk takes precedence by default).
Arguably this isn't a very interesting mode to test, because the
precedence rules mean that we'll always read the RIDX chunk over the
.rev file. But it makes it impossible for a user to induce corruption in
their repository by adjusting the test knobs (since if we had an
either/or knob they could stop writing the RIDX chunk, allowing them to
tweak the MIDX's object order without changing its checksum).
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Reviewed-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-01-25 22:41:17 +00:00
|
|
|
if (git_env_bool("GIT_TEST_MIDX_READ_RIDX", 1))
|
2023-10-09 21:05:33 +00:00
|
|
|
pair_chunk(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex,
|
|
|
|
&m->chunk_revindex_len);
|
midx: read `RIDX` chunk when present
When a MIDX contains the new `RIDX` chunk, ensure that the reverse index
is read from it instead of the on-disk .rev file. Since we need to
encode the object order in the MIDX itself for correctness reasons,
there is no point in storing the same data again outside of the MIDX.
So, this patch stops writing separate .rev files, and reads it out of
the MIDX itself. This is possible to do with relatively little new code,
since the format of the RIDX chunk is identical to the data in the .rev
file. In other words, we can implement this by pointing the
`revindex_data` field at the reverse index chunk of the MIDX instead of
the .rev file without any other changes.
Note that we have two knobs that are adjusted for the new tests:
GIT_TEST_MIDX_WRITE_REV and GIT_TEST_MIDX_READ_RIDX. The former controls
whether the MIDX .rev is written at all, and the latter controls whether
we read the MIDX's RIDX chunk.
Both are necessary to ensure that the test added at the beginning of
this series continues to work. This is because we always need to write
the RIDX chunk in the MIDX in order to change its checksum, but we want
to make sure reading the existing .rev file still works (since the RIDX
chunk takes precedence by default).
Arguably this isn't a very interesting mode to test, because the
precedence rules mean that we'll always read the RIDX chunk over the
.rev file. But it makes it impossible for a user to induce corruption in
their repository by adjusting the test knobs (since if we had an
either/or knob they could stop writing the RIDX chunk, allowing them to
tweak the MIDX's object order without changing its checksum).
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Reviewed-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-01-25 22:41:17 +00:00
|
|
|
|
2021-03-13 16:17:22 +00:00
|
|
|
CALLOC_ARRAY(m->pack_names, m->num_packs);
|
|
|
|
CALLOC_ARRAY(m->packs, m->num_packs);
|
2018-07-12 19:39:28 +00:00
|
|
|
|
|
|
|
cur_pack_name = (const char *)m->chunk_pack_names;
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
2023-10-09 21:05:14 +00:00
|
|
|
const char *end;
|
|
|
|
size_t avail = m->chunk_pack_names_len -
|
|
|
|
(cur_pack_name - (const char *)m->chunk_pack_names);
|
|
|
|
|
2018-07-12 19:39:28 +00:00
|
|
|
m->pack_names[i] = cur_pack_name;
|
|
|
|
|
2023-10-09 21:05:14 +00:00
|
|
|
end = memchr(cur_pack_name, '\0', avail);
|
|
|
|
if (!end)
|
|
|
|
die(_("multi-pack-index pack-name chunk is too short"));
|
|
|
|
cur_pack_name = end + 1;
|
2018-07-12 19:39:28 +00:00
|
|
|
|
2018-09-13 18:02:18 +00:00
|
|
|
if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0)
|
|
|
|
die(_("multi-pack-index pack names out of order: '%s' before '%s'"),
|
2018-07-12 19:39:28 +00:00
|
|
|
m->pack_names[i - 1],
|
|
|
|
m->pack_names[i]);
|
|
|
|
}
|
|
|
|
|
2019-03-21 19:36:13 +00:00
|
|
|
trace2_data_intmax("midx", the_repository, "load/num_packs", m->num_packs);
|
|
|
|
trace2_data_intmax("midx", the_repository, "load/num_objects", m->num_objects);
|
|
|
|
|
2021-10-21 03:39:47 +00:00
|
|
|
free_chunkfile(cf);
|
2018-07-12 19:39:23 +00:00
|
|
|
return m;
|
|
|
|
|
|
|
|
cleanup_fail:
|
|
|
|
free(m);
|
midx.c: write MIDX filenames to strbuf
To ask for the name of a MIDX and its corresponding .rev file, callers
invoke get_midx_filename() and get_midx_rev_filename(), respectively.
These both invoke xstrfmt(), allocating a chunk of memory which must be
freed later on.
This makes callers in pack-bitmap.c somewhat awkward. Specifically,
midx_bitmap_filename(), which is implemented like:
return xstrfmt("%s-%s.bitmap",
get_midx_filename(midx->object_dir),
hash_to_hex(get_midx_checksum(midx)));
this leaks the second argument to xstrfmt(), which itself was allocated
with xstrfmt(). This caller could assign both the result of
get_midx_filename() and the outer xstrfmt() to a temporary variable,
remembering to free() the former before returning. But that involves a
wasteful copy.
Instead, get_midx_filename() and get_midx_rev_filename() take a strbuf
as an output parameter. This way midx_bitmap_filename() can manipulate
and pass around a temporary buffer which it detaches back to its caller.
That allows us to implement the function without copying or open-coding
get_midx_filename() in a way that doesn't leak.
Update the other callers of get_midx_filename() and
get_midx_rev_filename() accordingly.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-10-26 21:01:21 +00:00
|
|
|
strbuf_release(&midx_name);
|
2021-10-21 03:39:47 +00:00
|
|
|
free_chunkfile(cf);
|
2018-07-12 19:39:23 +00:00
|
|
|
if (midx_map)
|
|
|
|
munmap(midx_map, midx_size);
|
|
|
|
if (0 <= fd)
|
|
|
|
close(fd);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-10-12 17:34:19 +00:00
|
|
|
void close_midx(struct multi_pack_index *m)
|
2018-07-12 19:39:36 +00:00
|
|
|
{
|
|
|
|
uint32_t i;
|
2018-10-12 17:34:19 +00:00
|
|
|
|
|
|
|
if (!m)
|
|
|
|
return;
|
|
|
|
|
2021-08-31 20:52:07 +00:00
|
|
|
close_midx(m->next);
|
|
|
|
|
2018-07-12 19:39:36 +00:00
|
|
|
munmap((unsigned char *)m->data, m->data_len);
|
|
|
|
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-29 16:18:56 +00:00
|
|
|
if (m->packs[i])
|
|
|
|
m->packs[i]->multi_pack_index = 0;
|
2018-07-12 19:39:36 +00:00
|
|
|
}
|
|
|
|
FREE_AND_NULL(m->packs);
|
|
|
|
FREE_AND_NULL(m->pack_names);
|
2021-08-31 20:52:07 +00:00
|
|
|
free(m);
|
2018-07-12 19:39:36 +00:00
|
|
|
}
|
|
|
|
|
2019-04-29 16:18:55 +00:00
|
|
|
int prepare_midx_pack(struct repository *r, struct multi_pack_index *m, uint32_t pack_int_id)
|
2018-07-12 19:39:34 +00:00
|
|
|
{
|
|
|
|
struct strbuf pack_name = STRBUF_INIT;
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-29 16:18:56 +00:00
|
|
|
struct packed_git *p;
|
2018-07-12 19:39:34 +00:00
|
|
|
|
|
|
|
if (pack_int_id >= m->num_packs)
|
2018-11-28 21:43:09 +00:00
|
|
|
die(_("bad pack-int-id: %u (%u total packs)"),
|
2018-09-13 18:02:25 +00:00
|
|
|
pack_int_id, m->num_packs);
|
2018-07-12 19:39:34 +00:00
|
|
|
|
|
|
|
if (m->packs[pack_int_id])
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
strbuf_addf(&pack_name, "%s/pack/%s", m->object_dir,
|
|
|
|
m->pack_names[pack_int_id]);
|
|
|
|
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-29 16:18:56 +00:00
|
|
|
p = add_packed_git(pack_name.buf, pack_name.len, m->local);
|
2018-07-12 19:39:34 +00:00
|
|
|
strbuf_release(&pack_name);
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-29 16:18:56 +00:00
|
|
|
|
|
|
|
if (!p)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
p->multi_pack_index = 1;
|
|
|
|
m->packs[pack_int_id] = p;
|
|
|
|
install_packed_git(r, p);
|
|
|
|
list_add_tail(&p->mru, &r->objects->packed_git_mru);
|
|
|
|
|
|
|
|
return 0;
|
2018-07-12 19:39:34 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/*
 * Size in bytes of one entry in the BTMP ("bitmapped packs") chunk: two
 * 32-bit big-endian values (bitmap_pos followed by bitmap_nr), with
 * entries indexed by pack-int-id.
 */
#define MIDX_CHUNK_BITMAPPED_PACKS_WIDTH (2 * sizeof(uint32_t))
|
|
|
|
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decrease the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
int nth_bitmapped_pack(struct repository *r, struct multi_pack_index *m,
|
|
|
|
struct bitmapped_pack *bp, uint32_t pack_int_id)
|
|
|
|
{
|
|
|
|
if (!m->chunk_bitmapped_packs)
|
|
|
|
return error(_("MIDX does not contain the BTMP chunk"));
|
|
|
|
|
|
|
|
if (prepare_midx_pack(r, m, pack_int_id))
|
|
|
|
return error(_("could not load bitmapped pack %"PRIu32), pack_int_id);
|
|
|
|
|
|
|
|
bp->p = m->packs[pack_int_id];
|
|
|
|
bp->bitmap_pos = get_be32((char *)m->chunk_bitmapped_packs +
|
|
|
|
MIDX_CHUNK_BITMAPPED_PACKS_WIDTH * pack_int_id);
|
|
|
|
bp->bitmap_nr = get_be32((char *)m->chunk_bitmapped_packs +
|
|
|
|
MIDX_CHUNK_BITMAPPED_PACKS_WIDTH * pack_int_id +
|
|
|
|
sizeof(uint32_t));
|
|
|
|
bp->pack_int_id = pack_int_id;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-12 19:39:34 +00:00
|
|
|
int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result)
|
|
|
|
{
|
|
|
|
return bsearch_hash(oid->hash, m->chunk_oid_fanout, m->chunk_oid_lookup,
|
2019-08-18 20:04:27 +00:00
|
|
|
the_hash_algo->rawsz, result);
|
2018-07-12 19:39:34 +00:00
|
|
|
}
|
|
|
|
|
2018-07-12 19:39:35 +00:00
|
|
|
struct object_id *nth_midxed_object_oid(struct object_id *oid,
|
|
|
|
struct multi_pack_index *m,
|
|
|
|
uint32_t n)
|
|
|
|
{
|
|
|
|
if (n >= m->num_objects)
|
|
|
|
return NULL;
|
|
|
|
|
2023-07-12 23:37:38 +00:00
|
|
|
oidread(oid, m->chunk_oid_lookup + st_mult(m->hash_len, n));
|
2018-07-12 19:39:35 +00:00
|
|
|
return oid;
|
|
|
|
}
|
|
|
|
|
2021-03-30 15:04:20 +00:00
|
|
|
off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
|
2018-07-12 19:39:34 +00:00
|
|
|
{
|
|
|
|
const unsigned char *offset_data;
|
|
|
|
uint32_t offset32;
|
|
|
|
|
2021-02-18 14:07:37 +00:00
|
|
|
offset_data = m->chunk_object_offsets + (off_t)pos * MIDX_CHUNK_OFFSET_WIDTH;
|
2018-07-12 19:39:34 +00:00
|
|
|
offset32 = get_be32(offset_data + sizeof(uint32_t));
|
|
|
|
|
|
|
|
if (m->chunk_large_offsets && offset32 & MIDX_LARGE_OFFSET_NEEDED) {
|
2018-09-13 18:02:23 +00:00
|
|
|
if (sizeof(off_t) < sizeof(uint64_t))
|
2018-07-12 19:39:34 +00:00
|
|
|
die(_("multi-pack-index stores a 64-bit offset, but off_t is too small"));
|
|
|
|
|
|
|
|
offset32 ^= MIDX_LARGE_OFFSET_NEEDED;
|
2023-10-09 21:05:30 +00:00
|
|
|
if (offset32 >= m->chunk_large_offsets_len / sizeof(uint64_t))
|
|
|
|
die(_("multi-pack-index large offset out of bounds"));
|
|
|
|
return get_be64(m->chunk_large_offsets + sizeof(uint64_t) * offset32);
|
2018-07-12 19:39:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return offset32;
|
|
|
|
}
|
|
|
|
|
2021-03-30 15:04:20 +00:00
|
|
|
uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
|
2018-07-12 19:39:34 +00:00
|
|
|
{
|
2021-02-18 14:07:37 +00:00
|
|
|
return get_be32(m->chunk_object_offsets +
|
|
|
|
(off_t)pos * MIDX_CHUNK_OFFSET_WIDTH);
|
2018-07-12 19:39:34 +00:00
|
|
|
}
|
|
|
|
|
2022-10-12 22:01:48 +00:00
|
|
|
int fill_midx_entry(struct repository *r,
|
2021-09-11 20:39:31 +00:00
|
|
|
const struct object_id *oid,
|
|
|
|
struct pack_entry *e,
|
|
|
|
struct multi_pack_index *m)
|
2018-07-12 19:39:34 +00:00
|
|
|
{
|
2021-09-11 20:39:31 +00:00
|
|
|
uint32_t pos;
|
2018-07-12 19:39:34 +00:00
|
|
|
uint32_t pack_int_id;
|
|
|
|
struct packed_git *p;
|
|
|
|
|
2021-09-11 20:39:31 +00:00
|
|
|
if (!bsearch_midx(oid, m, &pos))
|
|
|
|
return 0;
|
|
|
|
|
2018-07-12 19:39:34 +00:00
|
|
|
if (pos >= m->num_objects)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pack_int_id = nth_midxed_pack_int_id(m, pos);
|
|
|
|
|
2019-04-29 16:18:55 +00:00
|
|
|
if (prepare_midx_pack(r, m, pack_int_id))
|
midx.c: protect against disappearing packs
When a packed object is stored in a multi-pack index, but that pack has
racily gone away, the MIDX code simply calls die(), when it could be
returning an error to the caller, which would in turn lead to
re-scanning the pack directory.
A pack can racily disappear, for example, due to a simultaneous 'git
repack -ad',
You can also reproduce this with two terminals, where one is running:
git init
while true; do
git commit -q --allow-empty -m foo
git repack -ad
git multi-pack-index write
done
(in effect, constantly writing new MIDXs), and the other is running:
obj=$(git rev-parse HEAD)
while true; do
echo $obj | git cat-file --batch-check='%(objectsize:disk)' || break
done
That will sometimes hit the error preparing packfile from
multi-pack-index message, which this patch fixes.
Right now, that path to discovering a missing pack looks something like
'find_pack_entry()' calling 'fill_midx_entry()' and eventually making
its way to call 'nth_midxed_pack_entry()'.
'nth_midxed_pack_entry()' already checks 'is_pack_valid()' and
propagates an error if the pack is invalid. So, this works if the pack
has gone away between calling 'prepare_midx_pack()' and before calling
'is_pack_valid()', but not if it disappears before then.
Catch the case where the pack has already disappeared before
'prepare_midx_pack()' by returning an error in that case, too.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-25 17:17:33 +00:00
|
|
|
return 0;
|
2018-07-12 19:39:34 +00:00
|
|
|
p = m->packs[pack_int_id];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are about to tell the caller where they can locate the
|
|
|
|
* requested object. We better make sure the packfile is
|
|
|
|
* still here and can be accessed before supplying that
|
|
|
|
* answer, as it may have been deleted since the MIDX was
|
|
|
|
* loaded!
|
|
|
|
*/
|
|
|
|
if (!is_pack_valid(p))
|
|
|
|
return 0;
|
|
|
|
|
2021-09-11 20:43:26 +00:00
|
|
|
if (oidset_size(&p->bad_objects) &&
|
|
|
|
oidset_contains(&p->bad_objects, oid))
|
|
|
|
return 0;
|
2018-08-20 16:51:57 +00:00
|
|
|
|
2018-07-12 19:39:34 +00:00
|
|
|
e->offset = nth_midxed_offset(m, pos);
|
|
|
|
e->p = p;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
midx: check both pack and index names for containment
A midx file (and the struct we parse from it) contains a list of all of
the covered packfiles, mentioned by their ".idx" names (e.g.,
"pack-1234.idx", etc). And thus calls to midx_contains_pack() expect
callers to provide the idx name.
This works for most of the calls, but the one in open_packed_git_1()
tries to feed a packed_git->pack_name, which is the ".pack" name,
meaning we'll never find a match (even if the pack is covered by the
midx).
We can fix this by converting the ".pack" to ".idx" in the caller.
However, that requires allocating a new string. Instead, let's make
midx_contains_pack() a bit friendlier, and allow it take _either_ the
.pack or .idx variant.
All cleverness in the matching code is credited to René. Bugs are mine.
There's no test here, because while this does fix _a_ bug, it's masked
by another bug in that same caller. That will be covered (with a test)
in the next patch.
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-05 18:06:04 +00:00
|
|
|
/*
 * Compare a name given as either "foo.pack" or "foo.idx" against the index
 * name "foo.idx".
 *
 * Returns 0 when the two names refer to the same pack; otherwise returns the
 * result of a plain strcmp() on the parts that follow the common prefix.
 */
int cmp_idx_or_pack_name(const char *idx_or_pack_name,
			 const char *idx_name)
{
	size_t common = 0;

	/* Measure the prefix that both names share, then step over it. */
	while (idx_name[common] &&
	       idx_name[common] == idx_or_pack_name[common])
		common++;
	idx_name += common;
	idx_or_pack_name += common;

	/*
	 * An incomplete match is still fine if everything up to and including
	 * "pack-1234." agreed and the leftovers are "idx" and "pack". The
	 * "idx" vs "idx" case needs no special handling: that is a complete
	 * match, both remainders are empty, and the strcmp() at the bottom
	 * returns 0 for it.
	 *
	 * Strictly speaking this also accepts "fooidx" vs "foopack", but such
	 * names are never expected to occur.
	 */
	if (strcmp(idx_name, "idx") == 0 &&
	    strcmp(idx_or_pack_name, "pack") == 0)
		return 0;

	/*
	 * Besides detecting a complete match, this orders by the first
	 * differing character, i.e. exactly like a raw strcmp() of the full
	 * names. That keeps the function usable for binary searching a
	 * naively-sorted list.
	 */
	return strcmp(idx_or_pack_name, idx_name);
}
|
|
|
|
|
2023-12-14 22:23:54 +00:00
|
|
|
int midx_locate_pack(struct multi_pack_index *m, const char *idx_or_pack_name,
|
|
|
|
uint32_t *pos)
|
2018-07-12 19:39:36 +00:00
|
|
|
{
|
|
|
|
uint32_t first = 0, last = m->num_packs;
|
|
|
|
|
|
|
|
while (first < last) {
|
|
|
|
uint32_t mid = first + (last - first) / 2;
|
|
|
|
const char *current;
|
|
|
|
int cmp;
|
|
|
|
|
|
|
|
current = m->pack_names[mid];
|
midx: check both pack and index names for containment
A midx file (and the struct we parse from it) contains a list of all of
the covered packfiles, mentioned by their ".idx" names (e.g.,
"pack-1234.idx", etc). And thus calls to midx_contains_pack() expect
callers to provide the idx name.
This works for most of the calls, but the one in open_packed_git_1()
tries to feed a packed_git->pack_name, which is the ".pack" name,
meaning we'll never find a match (even if the pack is covered by the
midx).
We can fix this by converting the ".pack" to ".idx" in the caller.
However, that requires allocating a new string. Instead, let's make
midx_contains_pack() a bit friendlier, and allow it take _either_ the
.pack or .idx variant.
All cleverness in the matching code is credited to René. Bugs are mine.
There's no test here, because while this does fix _a_ bug, it's masked
by another bug in that same caller. That will be covered (with a test)
in the next patch.
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-05 18:06:04 +00:00
|
|
|
cmp = cmp_idx_or_pack_name(idx_or_pack_name, current);
|
2023-12-14 22:23:54 +00:00
|
|
|
if (!cmp) {
|
|
|
|
if (pos)
|
|
|
|
*pos = mid;
|
2018-07-12 19:39:36 +00:00
|
|
|
return 1;
|
2023-12-14 22:23:54 +00:00
|
|
|
}
|
2018-07-12 19:39:36 +00:00
|
|
|
if (cmp > 0) {
|
|
|
|
first = mid + 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
last = mid;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-12-14 22:23:54 +00:00
|
|
|
/*
 * Returns 1 if the MIDX lists the pack named by idx_or_pack_name, 0 if not.
 * This is midx_locate_pack() for callers that do not need the position.
 */
int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name)
{
	uint32_t *unused_pos = NULL;

	return midx_locate_pack(m, idx_or_pack_name, unused_pos);
}
|
|
|
|
|
midx: implement `midx_preferred_pack()`
When performing a binary search over the objects in a MIDX's bitmap
(i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack
ordering using a combination of (a) the preferred pack, (b) the pack's
lexical position in the MIDX based on pack names, and (c) the object
offset within the pack.
In order to perform this binary search, the reader must know the
identity of the preferred pack. This could be stored in the MIDX, but
isn't for historical reasons, mostly because it can easily be inferred
at read-time by looking at the object in the first bit position and
finding out which pack it was selected from in the MIDX, like so:
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
In midx_to_pack_pos() which performs this binary search, we look up the
identity of the preferred pack before each search. This is relatively
quick, since it involves two table-driven lookups (one in the MIDX's
revindex for `pack_pos_to_midx()`, and another in the MIDX's object
table for `nth_midxed_pack_int_id()`).
But since the preferred pack does not change after the MIDX is written,
it is safe to cache this value on the MIDX itself.
Write a helper to do just that, and rewrite all of the existing
call-sites that care about the identity of the preferred pack in terms
of this new helper.
This will prepare us for a subsequent patch where we will need to binary
search through the MIDX's pseudo-pack order multiple times.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:24:25 +00:00
|
|
|
int midx_preferred_pack(struct multi_pack_index *m, uint32_t *pack_int_id)
|
|
|
|
{
|
|
|
|
if (m->preferred_pack_idx == -1) {
|
|
|
|
if (load_midx_revindex(m) < 0) {
|
|
|
|
m->preferred_pack_idx = -2;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
m->preferred_pack_idx =
|
|
|
|
nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0));
|
|
|
|
} else if (m->preferred_pack_idx == -2)
|
|
|
|
return -1; /* no revindex */
|
|
|
|
|
|
|
|
*pack_int_id = m->preferred_pack_idx;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-08-20 16:51:55 +00:00
|
|
|
int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local)
|
2018-07-12 19:39:33 +00:00
|
|
|
{
|
2018-08-20 16:52:00 +00:00
|
|
|
struct multi_pack_index *m;
|
2018-07-12 19:39:33 +00:00
|
|
|
struct multi_pack_index *m_search;
|
|
|
|
|
2020-09-25 12:33:34 +00:00
|
|
|
prepare_repo_settings(r);
|
|
|
|
if (!r->settings.core_multi_pack_index)
|
2018-07-12 19:39:33 +00:00
|
|
|
return 0;
|
|
|
|
|
2018-08-20 16:52:00 +00:00
|
|
|
for (m_search = r->objects->multi_pack_index; m_search; m_search = m_search->next)
|
2018-07-12 19:39:33 +00:00
|
|
|
if (!strcmp(object_dir, m_search->object_dir))
|
|
|
|
return 1;
|
|
|
|
|
2018-08-20 16:52:00 +00:00
|
|
|
m = load_multi_pack_index(object_dir, local);
|
2018-07-12 19:39:33 +00:00
|
|
|
|
2018-08-20 16:52:00 +00:00
|
|
|
if (m) {
|
midx: traverse the local MIDX first
When a repository has an alternate object directory configured, callers
can traverse through each alternate's MIDX by walking the '->next'
pointer.
But, when 'prepare_multi_pack_index_one()' loads multiple MIDXs, it
places the new ones at the front of this pointer chain, not at the end.
This can be confusing for callers such as 'git repack -ad', causing test
failures like in t7700.6 with 'GIT_TEST_MULTI_PACK_INDEX=1'.
The occurs when dropping a pack known to the local MIDX with alternates
configured that have their own MIDX. Since the alternate's MIDX is
returned via 'get_multi_pack_index()', 'midx_contains_pack()' returns
true (which is correct, since it traverses through the '->next' pointer
to find the MIDX in the chain that does contain the requested object).
But, we call 'clear_midx_file()' on 'the_repository', which drops the
MIDX at the path of the first MIDX in the chain, which (in the case of
t7700.6 is the one in the alternate).
This patch addresses that by:
- placing the local MIDX first in the chain when calling
'prepare_multi_pack_index_one()', and
- introducing a new 'get_local_multi_pack_index()', which explicitly
returns the repository-local MIDX, if any.
Don't impose an additional order on the MIDX's '->next' pointer beyond
that the first item in the chain must be local if one exists so that we
avoid a quadratic insertion.
Likewise, use 'get_local_multi_pack_index()' in
'remove_redundant_pack()' to fix the formerly broken t7700.6 when run
with 'GIT_TEST_MULTI_PACK_INDEX=1'.
Finally, note that the MIDX ordering invariant is only preserved by the
insertion order in 'prepare_packed_git()', which traverses through the
ODB's '->next' pointer, meaning we visit the local object store first.
This fragility makes this an undesirable long-term solution if more
callers are added, but it is acceptable for now since this is the only
caller.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-28 20:22:13 +00:00
|
|
|
struct multi_pack_index *mp = r->objects->multi_pack_index;
|
|
|
|
if (mp) {
|
|
|
|
m->next = mp->next;
|
|
|
|
mp->next = m;
|
|
|
|
} else
|
|
|
|
r->objects->multi_pack_index = m;
|
2018-07-12 19:39:33 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
int midx_checksum_valid(struct multi_pack_index *m)
|
2021-03-30 15:04:11 +00:00
|
|
|
{
|
2024-04-01 21:16:34 +00:00
|
|
|
return hashfile_checksum_valid(m->data, m->data_len);
|
2021-03-30 15:04:11 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/* Callback state passed to clear_midx_file_ext() via its void *_data argument. */
struct clear_midx_data {
	/* Exact file name that must not be removed, or NULL to keep nothing. */
	char *keep;
	/* Only files whose name ends with this suffix are removal candidates. */
	const char *ext;
};
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/*
 * Per-file callback for for_each_file_in_pack_dir(): unlink full_path when
 * file_name starts with "multi-pack-index-", ends with data->ext, and is not
 * the file named by data->keep. Dies if the unlink fails.
 */
static void clear_midx_file_ext(const char *full_path, size_t full_path_len UNUSED,
				const char *file_name, void *_data)
{
	struct clear_midx_data *data = _data;

	/* Not a MIDX companion file with the requested extension. */
	if (!starts_with(file_name, "multi-pack-index-") ||
	    !ends_with(file_name, data->ext))
		return;

	/* The caller asked for this particular file to survive. */
	if (data->keep && !strcmp(data->keep, file_name))
		return;

	if (unlink(full_path))
		die_errno(_("failed to remove %s"), full_path);
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
void clear_midx_files_ext(const char *object_dir, const char *ext,
|
|
|
|
unsigned char *keep_hash)
|
2018-07-12 19:39:29 +00:00
|
|
|
{
|
2024-04-01 21:16:34 +00:00
|
|
|
struct clear_midx_data data;
|
|
|
|
memset(&data, 0, sizeof(struct clear_midx_data));
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (keep_hash)
|
|
|
|
data.keep = xstrfmt("multi-pack-index-%s%s",
|
|
|
|
hash_to_hex(keep_hash), ext);
|
|
|
|
data.ext = ext;
|
2021-03-30 15:04:11 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
for_each_file_in_pack_dir(object_dir,
|
|
|
|
clear_midx_file_ext,
|
|
|
|
&data);
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
free(data.keep);
|
2018-07-12 19:39:29 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
void clear_midx_file(struct repository *r)
|
2018-07-12 19:39:36 +00:00
|
|
|
{
|
2024-04-01 21:16:34 +00:00
|
|
|
struct strbuf midx = STRBUF_INIT;
|
2018-07-12 19:39:36 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
get_midx_filename(&midx, r->objects->odb->path);
|
2018-07-12 19:39:36 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (r->objects && r->objects->multi_pack_index) {
|
|
|
|
close_midx(r->objects->multi_pack_index);
|
|
|
|
r->objects->multi_pack_index = NULL;
|
|
|
|
}
|
2018-07-12 19:39:36 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (remove_path(midx.buf))
|
|
|
|
die(_("failed to clear multi-pack-index at %s"), midx.buf);
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
clear_midx_files_ext(r->objects->odb->path, ".bitmap", NULL);
|
|
|
|
clear_midx_files_ext(r->objects->odb->path, ".rev", NULL);
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
strbuf_release(&midx);
|
2018-07-12 19:39:29 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/* Set to 1 by midx_report() each time a verification problem is reported. */
static int verify_midx_error;
|
midx.c: extract `struct midx_fanout`
To build up a list of objects (along with their packs, and the offsets
within those packs that each object appears at), the MIDX code
implements `get_sorted_entries()` which builds up a list of candidates,
sorts them, and then removes duplicate entries.
To do this, it keeps an array of `pack_midx_entry` structures that it
builds up once for each fanout level (i.e., for all possible values of
the first byte of each object's ID).
This array is a function-local variable of `get_sorted_entries()`. Since
it uses the ALLOC_GROW() macro, having the `alloc_fanout` variable also
be local to that function, and only modified within that function is
convenient.
However, subsequent changes will extract the two ways this array is
filled (from a pack at some fanout value, and from an existing MIDX at
some fanout value) into separate functions. Instead of passing around
pointers to the entries array, along with `nr_fanout` and
`alloc_fanout`, encapsulate these three into a structure instead. Then
pass around a pointer to this structure instead.
This patch does not yet extract the above two functions, but sets us up
to begin doing so in the following commit. For now, the implementation
of get_sorted_entries() is only modified to replace `entries_by_fanout`
with `fanout.entries`, `nr_fanout` with `fanout.nr`, and so on.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-08-22 19:50:38 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
__attribute__((format (printf, 1, 2)))
|
|
|
|
static void midx_report(const char *fmt, ...)
|
midx.c: extract `struct midx_fanout`
To build up a list of objects (along with their packs, and the offsets
within those packs that each object appears at), the MIDX code
implements `get_sorted_entries()` which builds up a list of candidates,
sorts them, and then removes duplicate entries.
To do this, it keeps an array of `pack_midx_entry` structures that it
builds up once for each fanout level (ie., for all possible values of
the first byte of each object's ID).
This array is a function-local variable of `get_sorted_entries()`. Since
it uses the ALLOC_GROW() macro, having the `alloc_fanout` variable also
be local to that function, and only modified within that function is
convenient.
However, subsequent changes will extract the two ways this array is
filled (from a pack at some fanout value, and from an existing MIDX at
some fanout value) into separate functions. Instead of passing around
pointers to the entries array, along with `nr_fanout` and
`alloc_fanout`, encapsulate these three into a structure instead. Then
pass around a pointer to this structure instead.
This patch does not yet extract the above two functions, but sets us up
to begin doing so in the following commit. For now, the implementation
of get_sorted_entries() is only modified to replace `entries_by_fanout`
with `fanout.entries`, `nr_fanout` with `fanout.nr`, and so on.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-08-22 19:50:38 +00:00
|
|
|
{
|
2024-04-01 21:16:34 +00:00
|
|
|
va_list ap;
|
|
|
|
verify_midx_error = 1;
|
|
|
|
va_start(ap, fmt);
|
|
|
|
vfprintf(stderr, fmt, ap);
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
va_end(ap);
|
midx.c: extract `struct midx_fanout`
To build up a list of objects (along with their packs, and the offsets
within those packs that each object appears at), the MIDX code
implements `get_sorted_entries()` which builds up a list of candidates,
sorts them, and then removes duplicate entries.
To do this, it keeps an array of `pack_midx_entry` structures that it
builds up once for each fanout level (ie., for all possible values of
the first byte of each object's ID).
This array is a function-local variable of `get_sorted_entries()`. Since
it uses the ALLOC_GROW() macro, having the `alloc_fanout` variable also
be local to that function, and only modified within that function is
convenient.
However, subsequent changes will extract the two ways this array is
filled (from a pack at some fanout value, and from an existing MIDX at
some fanout value) into separate functions. Instead of passing around
pointers to the entries array, along with `nr_fanout` and
`alloc_fanout`, encapsulate these three into a structure instead. Then
pass around a pointer to this structure instead.
This patch does not yet extract the above two functions, but sets us up
to begin doing so in the following commit. For now, the implementation
of get_sorted_entries() is only modified to replace `entries_by_fanout`
with `fanout.entries`, `nr_fanout` with `fanout.nr`, and so on.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-08-22 19:50:38 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
/*
 * A (position, pack) pair, sortable by pack with compare_pair_pos_vs_id().
 * NOTE(review): `pos` is presumably an object's position in the MIDX;
 * confirm against the code that fills these in.
 */
struct pair_pos_vs_id {
	uint32_t pos;
	uint32_t pack_int_id;
};

/*
 * qsort()-style comparator that orders `struct pair_pos_vs_id` entries by
 * descending pack_int_id: the result is positive when b's pack ID is larger
 * than a's, negative when it is smaller, and 0 when they are equal.
 *
 * The IDs are compared explicitly rather than subtracted. The difference of
 * two uint32_t values does not fit in an int once it reaches 2^31, so
 * returning it directly could report the wrong sign.
 */
static int compare_pair_pos_vs_id(const void *_a, const void *_b)
{
	const struct pair_pos_vs_id *a = _a;
	const struct pair_pos_vs_id *b = _b;

	if (b->pack_int_id > a->pack_int_id)
		return 1;
	if (b->pack_int_id < a->pack_int_id)
		return -1;
	return 0;
}
|
|
|
|
|
2018-07-12 19:39:29 +00:00
|
|
|
/*
 * Calling display_progress() for every single item is needlessly expensive,
 * so midx_display_sparse_progress() only forwards counts that are a multiple
 * of SPARSE_PROGRESS_INTERVAL. The interval itself was picked arbitrarily.
 *
 * `n` is evaluated exactly once.
 */
#define SPARSE_PROGRESS_INTERVAL (1 << 12)
#define midx_display_sparse_progress(progress, n) \
	do { \
		uint64_t _n = (n); \
		if (!(_n & (SPARSE_PROGRESS_INTERVAL - 1))) \
			display_progress(progress, _n); \
	} while (0)
|
2018-07-12 19:39:36 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
int verify_midx_file(struct repository *r, const char *object_dir, unsigned flags)
|
|
|
|
{
|
|
|
|
struct pair_pos_vs_id *pairs = NULL;
|
|
|
|
uint32_t i;
|
|
|
|
struct progress *progress = NULL;
|
|
|
|
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
|
|
|
|
verify_midx_error = 0;
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (!m) {
|
|
|
|
int result = 0;
|
|
|
|
struct stat sb;
|
|
|
|
struct strbuf filename = STRBUF_INIT;
|
midx.c: include preferred pack correctly with existing MIDX
This patch resolves an issue where the object order used to generate a
MIDX bitmap would violate an invariant that all of the preferred pack's
objects are represented by that pack in the MIDX.
The problem arises when reusing an existing MIDX while generating a new
one, and occurs specifically when the identity of the preferred pack
changes from one MIDX to another, along with a few other conditions:
- the new preferred pack must also be present in the existing MIDX
- the new preferred pack must *not* have been the preferred pack in
the existing MIDX
- most importantly, there must be at least one object present in the
physical preferred pack (i.e., it shows up in that pack's index)
but was selected from a *different* pack when the previous MIDX
was generated
When the above conditions are all met, we end up (incorrectly)
discarding copies of some objects in the pack selected as the preferred
pack. This is because `get_sorted_entries()` adds objects to its list
by doing the following at each fanout level:
- first, adding all objects from that fanout level from an existing
MIDX
- then, adding all objects from that fanout level in each pack *not*
included in the existing MIDX
So if some object was not selected from the to-be-preferred pack when
writing the previous MIDX, then we will never consider it as a candidate
when generating the new MIDX. This means that it's possible for the
preferred pack to not include all of its objects in the MIDX's
pseudo-pack object order, which is an invariant violation of that order.
Resolve this by adding all objects from the preferred pack separately
when it appears in the existing MIDX (if one was present). This will
duplicate objects from that pack that *did* appear in the MIDX, but this
is fine, since get_sorted_entries() already handles duplicates. (A
future optimization in this area could avoid adding copies of objects
that we know already exist in the MIDX.)
Note that we no longer need to compute the preferred-ness of objects
added from the MIDX, since we only want to select the preferred objects
from a single source. (We could still mark these preferred bits, but
doing so is redundant and unnecessary).
This resolves the bug demonstrated by t5326.174 ("preferred pack change
with existing MIDX bitmap").
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-08-22 19:50:46 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
get_midx_filename(&filename, object_dir);
|
2018-07-12 19:39:29 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (!stat(filename.buf, &sb)) {
|
|
|
|
error(_("multi-pack-index file exists, but failed to parse"));
|
|
|
|
result = 1;
|
2018-07-12 19:39:29 +00:00
|
|
|
}
|
2024-04-01 21:16:34 +00:00
|
|
|
strbuf_release(&filename);
|
|
|
|
return result;
|
2018-07-12 19:39:29 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (!midx_checksum_valid(m))
|
|
|
|
midx_report(_("incorrect checksum"));
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_delayed_progress(_("Looking for referenced packfiles"),
|
|
|
|
m->num_packs);
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
|
|
|
if (prepare_midx_pack(r, m, i))
|
|
|
|
midx_report("failed to load pack in position %d", i);
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
display_progress(progress, i + 1);
|
2018-07-12 19:39:27 +00:00
|
|
|
}
|
2024-04-01 21:16:34 +00:00
|
|
|
stop_progress(&progress);
|
2018-07-12 19:39:27 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (m->num_objects == 0) {
|
|
|
|
midx_report(_("the midx contains no oid"));
|
|
|
|
/*
|
|
|
|
* Remaining tests assume that we have objects, so we can
|
|
|
|
* return here.
|
|
|
|
*/
|
|
|
|
goto cleanup;
|
2018-07-12 19:39:27 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Verifying OID order in multi-pack-index"),
|
|
|
|
m->num_objects - 1);
|
|
|
|
for (i = 0; i < m->num_objects - 1; i++) {
|
|
|
|
struct object_id oid1, oid2;
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decreases the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
nth_midxed_object_oid(&oid1, m, i);
|
|
|
|
nth_midxed_object_oid(&oid2, m, i + 1);
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decreases the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (oidcmp(&oid1, &oid2) >= 0)
|
|
|
|
midx_report(_("oid lookup out of order: oid[%d] = %s >= %s = oid[%d]"),
|
|
|
|
i, oid_to_hex(&oid1), oid_to_hex(&oid2), i + 1);
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decreases the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
midx_display_sparse_progress(progress, i + 1);
|
midx: implement `BTMP` chunk
When a multi-pack bitmap is used to implement verbatim pack reuse (that
is, when verbatim chunks from an on-disk packfile are copied
directly[^1]), it does so by using its "preferred pack" as the source
for pack-reuse.
This allows repositories to pack the majority of their objects into a
single (often large) pack, and then use it as the single source for
verbatim pack reuse. This increases the amount of objects that are
reused verbatim (and consequently, decreases the amount of time it takes
to generate many packs). But this performance comes at a cost, which is
that the preferred packfile must pace its growth with that of the entire
repository in order to maintain the utility of verbatim pack reuse.
As repositories grow beyond what we can reasonably store in a single
packfile, the utility of verbatim pack reuse diminishes. Or, at the very
least, it becomes increasingly more expensive to maintain as the pack
grows larger and larger.
It would be beneficial to be able to perform this same optimization over
multiple packs, provided some modest constraints (most importantly, that
the set of packs eligible for verbatim reuse are disjoint with respect
to the subset of their objects being sent).
If we assume that the packs which we treat as candidates for verbatim
reuse are disjoint with respect to any of their objects we may output,
we need to make only modest modifications to the verbatim pack-reuse
code itself. Most notably, we need to remove the assumption that the
bits in the reachability bitmap corresponding to objects from the single
reuse pack begin at the first bit position.
Future patches will unwind these assumptions and reimplement their
existing functionality as special cases of the more general assumptions
(e.g. that reuse bits can start anywhere within the bitset, but happen
to start at 0 for all existing cases).
This patch does not yet relax any of those assumptions. Instead, it
implements a foundational data-structure, the "Bitmapped Packs" (`BTMP`)
chunk of the multi-pack index. The `BTMP` chunk's contents are described
in detail here. Importantly, the `BTMP` chunk contains information to
map regions of a multi-pack index's reachability bitmap to the packs
whose objects they represent.
For now, this chunk is only written, not read (outside of the test-tool
used in this patch to test the new chunk's behavior). Future patches
will begin to make use of this new chunk.
[^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the
pack that wasn't used verbatim.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-12-14 22:23:51 +00:00
|
|
|
}
|
2024-04-01 21:16:34 +00:00
|
|
|
stop_progress(&progress);
|
2018-07-12 19:39:31 +00:00
|
|
|
|
|
|
|
/*
|
2024-04-01 21:16:34 +00:00
|
|
|
* Create an array mapping each object to its packfile id. Sort it
|
|
|
|
* to group the objects by packfile. Use this permutation to visit
|
|
|
|
* each of the objects and only require 1 packfile to be open at a
|
|
|
|
* time.
|
|
|
|
*/
|
|
|
|
ALLOC_ARRAY(pairs, m->num_objects);
|
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
pairs[i].pos = i;
|
|
|
|
pairs[i].pack_int_id = nth_midxed_pack_int_id(m, i);
|
|
|
|
}
|
2018-07-12 19:39:31 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Sorting objects by packfile"),
|
|
|
|
m->num_objects);
|
|
|
|
display_progress(progress, 0); /* TODO: Measure QSORT() progress */
|
|
|
|
QSORT(pairs, m->num_objects, compare_pair_pos_vs_id);
|
|
|
|
stop_progress(&progress);
|
2018-07-12 19:39:31 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Verifying object offsets"), m->num_objects);
|
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
struct object_id oid;
|
|
|
|
struct pack_entry e;
|
|
|
|
off_t m_offset, p_offset;
|
2018-07-12 19:39:31 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (i > 0 && pairs[i-1].pack_int_id != pairs[i].pack_int_id &&
|
|
|
|
m->packs[pairs[i-1].pack_int_id])
|
|
|
|
{
|
|
|
|
close_pack_fd(m->packs[pairs[i-1].pack_int_id]);
|
|
|
|
close_pack_index(m->packs[pairs[i-1].pack_int_id]);
|
|
|
|
}
|
2018-07-12 19:39:31 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
nth_midxed_object_oid(&oid, m, pairs[i].pos);
|
2018-07-12 19:39:30 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (!fill_midx_entry(r, &oid, &e, m)) {
|
|
|
|
midx_report(_("failed to load pack entry for oid[%d] = %s"),
|
|
|
|
pairs[i].pos, oid_to_hex(&oid));
|
|
|
|
continue;
|
|
|
|
}
|
2018-07-12 19:39:30 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (open_pack_index(e.p)) {
|
|
|
|
midx_report(_("failed to load pack-index for packfile %s"),
|
|
|
|
e.p->pack_name);
|
|
|
|
break;
|
2018-07-12 19:39:30 +00:00
|
|
|
}
|
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
m_offset = e.offset;
|
|
|
|
p_offset = find_pack_entry_one(oid.hash, e.p);
|
2018-07-12 19:39:30 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
if (m_offset != p_offset)
|
|
|
|
midx_report(_("incorrect object offset for oid[%d] = %s: %"PRIx64" != %"PRIx64),
|
|
|
|
pairs[i].pos, oid_to_hex(&oid), m_offset, p_offset);
|
2018-07-12 19:39:30 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
midx_display_sparse_progress(progress, i + 1);
|
2018-07-12 19:39:32 +00:00
|
|
|
}
|
2024-04-01 21:16:34 +00:00
|
|
|
stop_progress(&progress);
|
2018-07-12 19:39:32 +00:00
|
|
|
|
2024-04-01 21:16:34 +00:00
|
|
|
cleanup:
|
|
|
|
free(pairs);
|
|
|
|
close_midx(m);
|
2019-03-21 19:36:15 +00:00
|
|
|
|
2018-09-13 18:02:13 +00:00
|
|
|
return verify_midx_error;
|
|
|
|
}
|