global: introduce `USE_THE_REPOSITORY_VARIABLE` macro
Use of the `the_repository` variable is deprecated nowadays, and we
slowly but steadily convert the codebase to not use it anymore. Instead,
callers should be passing down the repository to work on via parameters.
It is hard though to prove that a given code unit does not use this
variable anymore. The most trivial case, merely demonstrating that there
is no direct use of `the_repository`, is already a bit of a pain during
code reviews as the reviewer needs to manually verify claims made by the
patch author. The bigger problem though is that we have many interfaces
that implicitly rely on `the_repository`.
Introduce a new `USE_THE_REPOSITORY_VARIABLE` macro that allows code
units to opt into usage of `the_repository`. The intent of this macro is
to demonstrate that a certain code unit does not use this variable
anymore, and to keep it from new dependencies on it in future changes,
be it explicit or implicit
For now, the macro only guards `the_repository` itself as well as
`the_hash_algo`. There are many more known interfaces where we have an
implicit dependency on `the_repository`, but those are not guarded at
the current point in time. Over time though, we should start to add
guards as required (or even better, just remove them).
Define the macro as required in our code units. As expected, most of our
code still relies on the global variable. Nearly all of our builtins
rely on the variable as there is no way yet to pass `the_repository` to
their entry point. For now, declare the macro in "biultin.h" to keep the
required changes at least a little bit more contained.
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2024-06-14 06:50:23 +00:00
|
|
|
#define USE_THE_REPOSITORY_VARIABLE
|
|
|
|
|
2023-02-24 00:09:24 +00:00
|
|
|
#include "git-compat-util.h"
|
2021-02-18 14:07:24 +00:00
|
|
|
#include "chunk-format.h"
|
|
|
|
#include "csum-file.h"
|
2023-03-21 06:25:54 +00:00
|
|
|
#include "gettext.h"
|
2023-04-22 20:17:20 +00:00
|
|
|
#include "hash.h"
|
2023-03-21 06:26:08 +00:00
|
|
|
#include "trace2.h"
|
2021-02-18 14:07:24 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When writing a chunk-based file format, collect the chunks in
|
|
|
|
* an array of chunk_info structs. The size stores the _expected_
|
|
|
|
* amount of data that will be written by write_fn.
|
|
|
|
*/
|
|
|
|
struct chunk_info {
|
|
|
|
uint32_t id;
|
|
|
|
uint64_t size;
|
|
|
|
chunk_write_fn write_fn;
|
2021-02-18 14:07:34 +00:00
|
|
|
|
|
|
|
const void *start;
|
2021-02-18 14:07:24 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct chunkfile {
|
|
|
|
struct hashfile *f;
|
|
|
|
|
|
|
|
struct chunk_info *chunks;
|
|
|
|
size_t chunks_nr;
|
|
|
|
size_t chunks_alloc;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct chunkfile *init_chunkfile(struct hashfile *f)
|
|
|
|
{
|
|
|
|
struct chunkfile *cf = xcalloc(1, sizeof(*cf));
|
|
|
|
cf->f = f;
|
|
|
|
return cf;
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_chunkfile(struct chunkfile *cf)
|
|
|
|
{
|
|
|
|
if (!cf)
|
|
|
|
return;
|
|
|
|
free(cf->chunks);
|
|
|
|
free(cf);
|
|
|
|
}
|
|
|
|
|
|
|
|
int get_num_chunks(struct chunkfile *cf)
|
|
|
|
{
|
|
|
|
return cf->chunks_nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
void add_chunk(struct chunkfile *cf,
|
|
|
|
uint32_t id,
|
|
|
|
size_t size,
|
|
|
|
chunk_write_fn fn)
|
|
|
|
{
|
|
|
|
ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc);
|
|
|
|
|
|
|
|
cf->chunks[cf->chunks_nr].id = id;
|
|
|
|
cf->chunks[cf->chunks_nr].write_fn = fn;
|
|
|
|
cf->chunks[cf->chunks_nr].size = size;
|
|
|
|
cf->chunks_nr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
int write_chunkfile(struct chunkfile *cf, void *data)
|
|
|
|
{
|
csum-file.h: increase hashfile buffer size
The hashfile API uses a hard-coded buffer size of 8KB and has ever since
it was introduced in c38138c (git-pack-objects: write the pack files
with a SHA1 csum, 2005-06-26). It performs a similar function to the
hashing buffers in read-cache.c, but that code was updated from 8KB to
128KB in f279894 (read-cache: make the index write buffer size 128K,
2021-02-18). The justification there was that do_write_index() improves
from 1.02s to 0.72s. Since our end goal is to have the index writing
code use the hashfile API, we need to unify this buffer size to avoid a
performance regression.
There is a buffer, 'check_buffer', that is used to verify the check_fd
file descriptor. When this buffer increases to 128K to fit the data
being flushed, it causes the stack to overflow the limits placed in the
test suite. To avoid issues with stack size, move both 'buffer' and
'check_buffer' to be heap pointers within 'struct hashfile'. The
'check_buffer' member is left as NULL unless check_fd is set in
hashfd_check(). Both buffers are cleared as part of finalize_hashfile()
which also frees the full structure.
Since these buffers are now on the heap, we can adjust their size based
on the needs of the consumer. In particular, callers to
hashfd_throughput() are expecting to report progress indicators as the
buffer flushes. These callers would prefer the smaller 8k buffer to
avoid large delays between updates, especially for users with slower
networks. When the progress indicator is not used, the larger buffer is
preferrable.
By adding a new trace2 region in the chunk-format API, we can see that
the writing portion of 'git multi-pack-index write' lowers from ~1.49s
to ~1.47s on a Linux machine. These effects may be more pronounced or
diminished on other filesystems. The end-to-end timing is too noisy to
have a definitive change either way.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-18 18:32:46 +00:00
|
|
|
int i, result = 0;
|
2021-02-18 14:07:24 +00:00
|
|
|
uint64_t cur_offset = hashfile_total(cf->f);
|
|
|
|
|
csum-file.h: increase hashfile buffer size
The hashfile API uses a hard-coded buffer size of 8KB and has ever since
it was introduced in c38138c (git-pack-objects: write the pack files
with a SHA1 csum, 2005-06-26). It performs a similar function to the
hashing buffers in read-cache.c, but that code was updated from 8KB to
128KB in f279894 (read-cache: make the index write buffer size 128K,
2021-02-18). The justification there was that do_write_index() improves
from 1.02s to 0.72s. Since our end goal is to have the index writing
code use the hashfile API, we need to unify this buffer size to avoid a
performance regression.
There is a buffer, 'check_buffer', that is used to verify the check_fd
file descriptor. When this buffer increases to 128K to fit the data
being flushed, it causes the stack to overflow the limits placed in the
test suite. To avoid issues with stack size, move both 'buffer' and
'check_buffer' to be heap pointers within 'struct hashfile'. The
'check_buffer' member is left as NULL unless check_fd is set in
hashfd_check(). Both buffers are cleared as part of finalize_hashfile()
which also frees the full structure.
Since these buffers are now on the heap, we can adjust their size based
on the needs of the consumer. In particular, callers to
hashfd_throughput() are expecting to report progress indicators as the
buffer flushes. These callers would prefer the smaller 8k buffer to
avoid large delays between updates, especially for users with slower
networks. When the progress indicator is not used, the larger buffer is
preferrable.
By adding a new trace2 region in the chunk-format API, we can see that
the writing portion of 'git multi-pack-index write' lowers from ~1.49s
to ~1.47s on a Linux machine. These effects may be more pronounced or
diminished on other filesystems. The end-to-end timing is too noisy to
have a definitive change either way.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-18 18:32:46 +00:00
|
|
|
trace2_region_enter("chunkfile", "write", the_repository);
|
|
|
|
|
2021-02-18 14:07:24 +00:00
|
|
|
/* Add the table of contents to the current offset */
|
|
|
|
cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE;
|
|
|
|
|
|
|
|
for (i = 0; i < cf->chunks_nr; i++) {
|
|
|
|
hashwrite_be32(cf->f, cf->chunks[i].id);
|
|
|
|
hashwrite_be64(cf->f, cur_offset);
|
|
|
|
|
|
|
|
cur_offset += cf->chunks[i].size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Trailing entry marks the end of the chunks */
|
|
|
|
hashwrite_be32(cf->f, 0);
|
|
|
|
hashwrite_be64(cf->f, cur_offset);
|
|
|
|
|
|
|
|
for (i = 0; i < cf->chunks_nr; i++) {
|
|
|
|
off_t start_offset = hashfile_total(cf->f);
|
csum-file.h: increase hashfile buffer size
The hashfile API uses a hard-coded buffer size of 8KB and has ever since
it was introduced in c38138c (git-pack-objects: write the pack files
with a SHA1 csum, 2005-06-26). It performs a similar function to the
hashing buffers in read-cache.c, but that code was updated from 8KB to
128KB in f279894 (read-cache: make the index write buffer size 128K,
2021-02-18). The justification there was that do_write_index() improves
from 1.02s to 0.72s. Since our end goal is to have the index writing
code use the hashfile API, we need to unify this buffer size to avoid a
performance regression.
There is a buffer, 'check_buffer', that is used to verify the check_fd
file descriptor. When this buffer increases to 128K to fit the data
being flushed, it causes the stack to overflow the limits placed in the
test suite. To avoid issues with stack size, move both 'buffer' and
'check_buffer' to be heap pointers within 'struct hashfile'. The
'check_buffer' member is left as NULL unless check_fd is set in
hashfd_check(). Both buffers are cleared as part of finalize_hashfile()
which also frees the full structure.
Since these buffers are now on the heap, we can adjust their size based
on the needs of the consumer. In particular, callers to
hashfd_throughput() are expecting to report progress indicators as the
buffer flushes. These callers would prefer the smaller 8k buffer to
avoid large delays between updates, especially for users with slower
networks. When the progress indicator is not used, the larger buffer is
preferrable.
By adding a new trace2 region in the chunk-format API, we can see that
the writing portion of 'git multi-pack-index write' lowers from ~1.49s
to ~1.47s on a Linux machine. These effects may be more pronounced or
diminished on other filesystems. The end-to-end timing is too noisy to
have a definitive change either way.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-18 18:32:46 +00:00
|
|
|
result = cf->chunks[i].write_fn(cf->f, data);
|
2021-02-18 14:07:24 +00:00
|
|
|
|
|
|
|
if (result)
|
csum-file.h: increase hashfile buffer size
The hashfile API uses a hard-coded buffer size of 8KB and has ever since
it was introduced in c38138c (git-pack-objects: write the pack files
with a SHA1 csum, 2005-06-26). It performs a similar function to the
hashing buffers in read-cache.c, but that code was updated from 8KB to
128KB in f279894 (read-cache: make the index write buffer size 128K,
2021-02-18). The justification there was that do_write_index() improves
from 1.02s to 0.72s. Since our end goal is to have the index writing
code use the hashfile API, we need to unify this buffer size to avoid a
performance regression.
There is a buffer, 'check_buffer', that is used to verify the check_fd
file descriptor. When this buffer increases to 128K to fit the data
being flushed, it causes the stack to overflow the limits placed in the
test suite. To avoid issues with stack size, move both 'buffer' and
'check_buffer' to be heap pointers within 'struct hashfile'. The
'check_buffer' member is left as NULL unless check_fd is set in
hashfd_check(). Both buffers are cleared as part of finalize_hashfile()
which also frees the full structure.
Since these buffers are now on the heap, we can adjust their size based
on the needs of the consumer. In particular, callers to
hashfd_throughput() are expecting to report progress indicators as the
buffer flushes. These callers would prefer the smaller 8k buffer to
avoid large delays between updates, especially for users with slower
networks. When the progress indicator is not used, the larger buffer is
preferrable.
By adding a new trace2 region in the chunk-format API, we can see that
the writing portion of 'git multi-pack-index write' lowers from ~1.49s
to ~1.47s on a Linux machine. These effects may be more pronounced or
diminished on other filesystems. The end-to-end timing is too noisy to
have a definitive change either way.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-18 18:32:46 +00:00
|
|
|
goto cleanup;
|
2021-02-18 14:07:24 +00:00
|
|
|
|
|
|
|
if (hashfile_total(cf->f) - start_offset != cf->chunks[i].size)
|
|
|
|
BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
|
|
|
|
cf->chunks[i].size, cf->chunks[i].id,
|
|
|
|
hashfile_total(cf->f) - start_offset);
|
|
|
|
}
|
|
|
|
|
csum-file.h: increase hashfile buffer size
The hashfile API uses a hard-coded buffer size of 8KB and has ever since
it was introduced in c38138c (git-pack-objects: write the pack files
with a SHA1 csum, 2005-06-26). It performs a similar function to the
hashing buffers in read-cache.c, but that code was updated from 8KB to
128KB in f279894 (read-cache: make the index write buffer size 128K,
2021-02-18). The justification there was that do_write_index() improves
from 1.02s to 0.72s. Since our end goal is to have the index writing
code use the hashfile API, we need to unify this buffer size to avoid a
performance regression.
There is a buffer, 'check_buffer', that is used to verify the check_fd
file descriptor. When this buffer increases to 128K to fit the data
being flushed, it causes the stack to overflow the limits placed in the
test suite. To avoid issues with stack size, move both 'buffer' and
'check_buffer' to be heap pointers within 'struct hashfile'. The
'check_buffer' member is left as NULL unless check_fd is set in
hashfd_check(). Both buffers are cleared as part of finalize_hashfile()
which also frees the full structure.
Since these buffers are now on the heap, we can adjust their size based
on the needs of the consumer. In particular, callers to
hashfd_throughput() are expecting to report progress indicators as the
buffer flushes. These callers would prefer the smaller 8k buffer to
avoid large delays between updates, especially for users with slower
networks. When the progress indicator is not used, the larger buffer is
preferrable.
By adding a new trace2 region in the chunk-format API, we can see that
the writing portion of 'git multi-pack-index write' lowers from ~1.49s
to ~1.47s on a Linux machine. These effects may be more pronounced or
diminished on other filesystems. The end-to-end timing is too noisy to
have a definitive change either way.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-05-18 18:32:46 +00:00
|
|
|
cleanup:
|
|
|
|
trace2_region_leave("chunkfile", "write", the_repository);
|
|
|
|
return result;
|
2021-02-18 14:07:24 +00:00
|
|
|
}
|
2021-02-18 14:07:34 +00:00
|
|
|
|
|
|
|
int read_table_of_contents(struct chunkfile *cf,
|
|
|
|
const unsigned char *mfile,
|
|
|
|
size_t mfile_size,
|
|
|
|
uint64_t toc_offset,
|
midx: enforce chunk alignment on reading
The midx reader assumes chunks are aligned to a 4-byte boundary: we
treat the fanout chunk as an array of uint32_t, indexing it to feed the
results to ntohl(). Without aligning the chunks, we may violate the
CPU's alignment constraints. Though many platforms allow this, some do
not. And certanily UBSan will complain, since it is undefined behavior.
Even though most chunks are naturally 4-byte-aligned (because they are
storing uint32_t or larger types), PNAM is not. It stores NUL-terminated
pack names, so you can have a valid chunk with any length. The writing
side handles this by 4-byte-aligning the chunk, introducing a few extra
NULs as necessary. But since we don't check this on the reading side, we
may end up with a misaligned fanout and trigger the undefined behavior.
We have two options here:
1. Swap out ntohl(fanout[i]) for get_be32(fanout+i) everywhere. The
latter handles alignment itself. It's possible that it's slightly
slower (though in practice I'm not sure how true that is,
especially for these code paths which then go on to do a binary
search).
2. Enforce the alignment when reading the chunks. This is easy to do,
since the table-of-contents reader can check it in one spot.
I went with the second option here, just because it places less burden
on maintenance going forward (it is OK to continue using ntohl), and we
know it can't have any performance impact on the actual reads.
The commit-graph code uses the same chunk API. It's usually also 4-byte
aligned, but some chunks are not (like Bloom filter BDAT chunks). So
we'll pass "1" here to allow any alignment. It doesn't suffer from the
same problem as midx with its fanout because the fanout chunk is always
the first (and the rest of the format dictates that the first chunk will
start aligned).
The new test shows the effect on a midx with a misaligned PNAM chunk.
Note that the midx-reading code treats chunk-toc errors as soft, falling
back to the non-midx path rather than calling die(), as we do for other
parsing errors. Arguably we should make all of these behave the same,
but that's out of scope for this patch. For now the test just expects
the fallback behavior.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 21:05:23 +00:00
|
|
|
int toc_length,
|
|
|
|
unsigned expected_alignment)
|
2021-02-18 14:07:34 +00:00
|
|
|
{
|
2021-02-18 14:07:38 +00:00
|
|
|
int i;
|
2021-02-18 14:07:34 +00:00
|
|
|
uint32_t chunk_id;
|
|
|
|
const unsigned char *table_of_contents = mfile + toc_offset;
|
|
|
|
|
|
|
|
ALLOC_GROW(cf->chunks, toc_length, cf->chunks_alloc);
|
|
|
|
|
|
|
|
while (toc_length--) {
|
|
|
|
uint64_t chunk_offset, next_chunk_offset;
|
|
|
|
|
|
|
|
chunk_id = get_be32(table_of_contents);
|
|
|
|
chunk_offset = get_be64(table_of_contents + 4);
|
|
|
|
|
|
|
|
if (!chunk_id) {
|
|
|
|
error(_("terminating chunk id appears earlier than expected"));
|
|
|
|
return 1;
|
|
|
|
}
|
midx: enforce chunk alignment on reading
The midx reader assumes chunks are aligned to a 4-byte boundary: we
treat the fanout chunk as an array of uint32_t, indexing it to feed the
results to ntohl(). Without aligning the chunks, we may violate the
CPU's alignment constraints. Though many platforms allow this, some do
not. And certanily UBSan will complain, since it is undefined behavior.
Even though most chunks are naturally 4-byte-aligned (because they are
storing uint32_t or larger types), PNAM is not. It stores NUL-terminated
pack names, so you can have a valid chunk with any length. The writing
side handles this by 4-byte-aligning the chunk, introducing a few extra
NULs as necessary. But since we don't check this on the reading side, we
may end up with a misaligned fanout and trigger the undefined behavior.
We have two options here:
1. Swap out ntohl(fanout[i]) for get_be32(fanout+i) everywhere. The
latter handles alignment itself. It's possible that it's slightly
slower (though in practice I'm not sure how true that is,
especially for these code paths which then go on to do a binary
search).
2. Enforce the alignment when reading the chunks. This is easy to do,
since the table-of-contents reader can check it in one spot.
I went with the second option here, just because it places less burden
on maintenance going forward (it is OK to continue using ntohl), and we
know it can't have any performance impact on the actual reads.
The commit-graph code uses the same chunk API. It's usually also 4-byte
aligned, but some chunks are not (like Bloom filter BDAT chunks). So
we'll pass "1" here to allow any alignment. It doesn't suffer from the
same problem as midx with its fanout because the fanout chunk is always
the first (and the rest of the format dictates that the first chunk will
start aligned).
The new test shows the effect on a midx with a misaligned PNAM chunk.
Note that the midx-reading code treats chunk-toc errors as soft, falling
back to the non-midx path rather than calling die(), as we do for other
parsing errors. Arguably we should make all of these behave the same,
but that's out of scope for this patch. For now the test just expects
the fallback behavior.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 21:05:23 +00:00
|
|
|
if (chunk_offset % expected_alignment != 0) {
|
|
|
|
error(_("chunk id %"PRIx32" not %d-byte aligned"),
|
|
|
|
chunk_id, expected_alignment);
|
|
|
|
return 1;
|
|
|
|
}
|
2021-02-18 14:07:34 +00:00
|
|
|
|
|
|
|
table_of_contents += CHUNK_TOC_ENTRY_SIZE;
|
|
|
|
next_chunk_offset = get_be64(table_of_contents + 4);
|
|
|
|
|
|
|
|
if (next_chunk_offset < chunk_offset ||
|
|
|
|
next_chunk_offset > mfile_size - the_hash_algo->rawsz) {
|
|
|
|
error(_("improper chunk offset(s) %"PRIx64" and %"PRIx64""),
|
|
|
|
chunk_offset, next_chunk_offset);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2021-02-18 14:07:38 +00:00
|
|
|
for (i = 0; i < cf->chunks_nr; i++) {
|
|
|
|
if (cf->chunks[i].id == chunk_id) {
|
|
|
|
error(_("duplicate chunk ID %"PRIx32" found"),
|
|
|
|
chunk_id);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-18 14:07:34 +00:00
|
|
|
cf->chunks[cf->chunks_nr].id = chunk_id;
|
|
|
|
cf->chunks[cf->chunks_nr].start = mfile + chunk_offset;
|
|
|
|
cf->chunks[cf->chunks_nr].size = next_chunk_offset - chunk_offset;
|
|
|
|
cf->chunks_nr++;
|
|
|
|
}
|
|
|
|
|
|
|
|
chunk_id = get_be32(table_of_contents);
|
|
|
|
if (chunk_id) {
|
|
|
|
error(_("final chunk has non-zero id %"PRIx32""), chunk_id);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
chunk-format: note that pair_chunk() is unsafe
The pair_chunk() function is provided as an easy helper for parsing
chunks that just want a pointer to a set of bytes. But every caller has
a hidden bug: because we return only the pointer without the matching
chunk size, the callers have no clue how many bytes they are allowed to
look at. And as a result, they may read off the end of the mmap'd data
when the on-disk file does not match their expectations.
Since chunk files are typically used for local-repository data like
commit-graph files and midx's, the security implications here are pretty
mild. The worst that can happen is that you hand somebody a corrupted
repository tarball, and running Git on it does an out-of-bounds read and
crashes. So it's worth being more defensive, but we don't need to drop
everything and fix every caller immediately.
I noticed the problem because the pair_chunk_fn() callback does not look
at its chunk_size argument, and wanted to annotate it to silence
-Wunused-parameter. We could do that now, but we'd lose the hint that
this code should be audited and fixed.
So instead, let's set ourselves up for going down that path:
1. Provide a pair_chunk() function that does return the size, which
prepares us for fixing these cases.
2. Rename the existing function to pair_chunk_unsafe(). That gives us
an easy way to grep for cases which still need to be fixed, and the
name should cause anybody adding new calls to think twice before
using it.
There are no callers of the "safe" version yet, but we'll add some in
subsequent patches.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 20:58:23 +00:00
|
|
|
struct pair_chunk_data {
|
|
|
|
const unsigned char **p;
|
|
|
|
size_t *size;
|
|
|
|
};
|
|
|
|
|
2021-02-18 14:07:34 +00:00
|
|
|
static int pair_chunk_fn(const unsigned char *chunk_start,
|
|
|
|
size_t chunk_size,
|
|
|
|
void *data)
|
|
|
|
{
|
chunk-format: note that pair_chunk() is unsafe
The pair_chunk() function is provided as an easy helper for parsing
chunks that just want a pointer to a set of bytes. But every caller has
a hidden bug: because we return only the pointer without the matching
chunk size, the callers have no clue how many bytes they are allowed to
look at. And as a result, they may read off the end of the mmap'd data
when the on-disk file does not match their expectations.
Since chunk files are typically used for local-repository data like
commit-graph files and midx's, the security implications here are pretty
mild. The worst that can happen is that you hand somebody a corrupted
repository tarball, and running Git on it does an out-of-bounds read and
crashes. So it's worth being more defensive, but we don't need to drop
everything and fix every caller immediately.
I noticed the problem because the pair_chunk_fn() callback does not look
at its chunk_size argument, and wanted to annotate it to silence
-Wunused-parameter. We could do that now, but we'd lose the hint that
this code should be audited and fixed.
So instead, let's set ourselves up for going down that path:
1. Provide a pair_chunk() function that does return the size, which
prepares us for fixing these cases.
2. Rename the existing function to pair_chunk_unsafe(). That gives us
an easy way to grep for cases which still need to be fixed, and the
name should cause anybody adding new calls to think twice before
using it.
There are no callers of the "safe" version yet, but we'll add some in
subsequent patches.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 20:58:23 +00:00
|
|
|
struct pair_chunk_data *pcd = data;
|
|
|
|
*pcd->p = chunk_start;
|
|
|
|
*pcd->size = chunk_size;
|
2021-02-18 14:07:34 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pair_chunk(struct chunkfile *cf,
|
|
|
|
uint32_t chunk_id,
|
chunk-format: note that pair_chunk() is unsafe
The pair_chunk() function is provided as an easy helper for parsing
chunks that just want a pointer to a set of bytes. But every caller has
a hidden bug: because we return only the pointer without the matching
chunk size, the callers have no clue how many bytes they are allowed to
look at. And as a result, they may read off the end of the mmap'd data
when the on-disk file does not match their expectations.
Since chunk files are typically used for local-repository data like
commit-graph files and midx's, the security implications here are pretty
mild. The worst that can happen is that you hand somebody a corrupted
repository tarball, and running Git on it does an out-of-bounds read and
crashes. So it's worth being more defensive, but we don't need to drop
everything and fix every caller immediately.
I noticed the problem because the pair_chunk_fn() callback does not look
at its chunk_size argument, and wanted to annotate it to silence
-Wunused-parameter. We could do that now, but we'd lose the hint that
this code should be audited and fixed.
So instead, let's set ourselves up for going down that path:
1. Provide a pair_chunk() function that does return the size, which
prepares us for fixing these cases.
2. Rename the existing function to pair_chunk_unsafe(). That gives us
an easy way to grep for cases which still need to be fixed, and the
name should cause anybody adding new calls to think twice before
using it.
There are no callers of the "safe" version yet, but we'll add some in
subsequent patches.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 20:58:23 +00:00
|
|
|
const unsigned char **p,
|
|
|
|
size_t *size)
|
|
|
|
{
|
|
|
|
struct pair_chunk_data pcd = { .p = p, .size = size };
|
|
|
|
return read_chunk(cf, chunk_id, pair_chunk_fn, &pcd);
|
|
|
|
}
|
|
|
|
|
2021-02-18 14:07:34 +00:00
|
|
|
int read_chunk(struct chunkfile *cf,
|
|
|
|
uint32_t chunk_id,
|
|
|
|
chunk_read_fn fn,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < cf->chunks_nr; i++) {
|
|
|
|
if (cf->chunks[i].id == chunk_id)
|
|
|
|
return fn(cf->chunks[i].start, cf->chunks[i].size, data);
|
|
|
|
}
|
|
|
|
|
|
|
|
return CHUNK_NOT_FOUND;
|
|
|
|
}
|
2022-05-20 23:17:41 +00:00
|
|
|
|
|
|
|
uint8_t oid_version(const struct git_hash_algo *algop)
|
|
|
|
{
|
|
|
|
switch (hash_algo_by_ptr(algop)) {
|
|
|
|
case GIT_HASH_SHA1:
|
|
|
|
return 1;
|
|
|
|
case GIT_HASH_SHA256:
|
|
|
|
return 2;
|
|
|
|
default:
|
|
|
|
die(_("invalid hash version"));
|
|
|
|
}
|
|
|
|
}
|