From 211c61c6cf56699fc7b520a0206d23f5e4a7261b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Tue, 5 Jul 2016 19:05:54 +0200 Subject: [PATCH 1/7] pack-objects: pass length to check_pack_crc() without truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32 bit systems with large file support, unsigned long is 32-bit while the two offsets in the subtraction expression (pack-objects has the exact same expression as in sha1_file.c but not shown in diff) are in 64-bit. If an in-pack object is larger than 2^32 len/datalen is truncated and we get a misleading "error: bad packed object CRC for ..." as a result. Use off_t for len and datalen. check_pack_crc() already accepts this argument as off_t and can deal with 4+ GB. Noticed-by: Christoph Michelbach Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 2 +- sha1_file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b6664ce4d8..a3a98c5741 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -349,7 +349,7 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry struct revindex_entry *revidx; off_t offset; enum object_type type = entry->type; - unsigned long datalen; + off_t datalen; unsigned char header[10], dheader[10]; unsigned hdrlen; diff --git a/sha1_file.c b/sha1_file.c index d0f2aa029b..cd9b560e70 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2282,7 +2282,7 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset, if (do_check_packed_object_crc && p->index_version > 1) { struct revindex_entry *revidx = find_pack_revindex(p, obj_offset); - unsigned long len = revidx[1].offset - obj_offset; + off_t len = revidx[1].offset - obj_offset; if (check_pack_crc(p, &w_curs, obj_offset, len, revidx->nr)) { const unsigned char *sha1 = nth_packed_object_sha1(p, revidx->nr); From 166df26f2821ea23b7c269a32fd63be43a2a0bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:43:59 +0200 Subject: [PATCH 2/7] sha1_file.c: use type off_t* for object_info->disk_sizep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This field, filled by sha1_object_info() contains the on-disk size of an object, which could go over 4GB limit of unsigned long on 32-bit systems. Use off_t for it instead and update all callers. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 4 ++-- cache.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 54db1184a0..13ed944d2d 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -131,7 +131,7 @@ struct expand_data { unsigned char sha1[20]; enum object_type type; unsigned long size; - unsigned long disk_size; + off_t disk_size; const char *rest; unsigned char delta_base_sha1[20]; @@ -184,7 +184,7 @@ static void expand_atom(struct strbuf *sb, const char *atom, int len, if (data->mark_query) data->info.disk_sizep = &data->disk_size; else - strbuf_addf(sb, "%lu", data->disk_size); + strbuf_addf(sb, "%"PRIuMAX, (uintmax_t)data->disk_size); } else if (is_atom("rest", atom, len)) { if (data->mark_query) data->split_on_whitespace = 1; diff --git a/cache.h b/cache.h index 4ff196c259..ea64b51846 100644 --- a/cache.h +++ b/cache.h @@ -1502,7 +1502,7 @@ struct object_info { /* Request */ enum object_type *typep; unsigned long *sizep; - unsigned long *disk_sizep; + off_t *disk_sizep; unsigned char *delta_base_sha1; struct strbuf *typename; From 7171a0b0cf5792fd549b601c84b274cd5e4155ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:44:00 +0200 Subject: [PATCH 3/7] index-pack: correct "len" type in unpack_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit systems with large file support, one entry could be larger than 4GB and overflow "len". Correct it so we can unpack a full entry. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index e8c71fc1d2..cafaab7b13 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -549,13 +549,13 @@ static void *unpack_data(struct object_entry *obj, void *cb_data) { off_t from = obj[0].idx.offset + obj[0].hdr_size; - unsigned long len = obj[1].idx.offset - from; + off_t len = obj[1].idx.offset - from; unsigned char *data, *inbuf; git_zstream stream; int status; data = xmallocz(consume ? 64*1024 : obj->size); - inbuf = xmalloc((len < 64*1024) ? len : 64*1024); + inbuf = xmalloc((len < 64*1024) ? (int)len : 64*1024); memset(&stream, 0, sizeof(stream)); git_inflate_init(&stream); @@ -563,15 +563,15 @@ static void *unpack_data(struct object_entry *obj, stream.avail_out = consume ? 64*1024 : obj->size; do { - ssize_t n = (len < 64*1024) ? len : 64*1024; + ssize_t n = (len < 64*1024) ? (ssize_t)len : 64*1024; n = xpread(get_thread_data()->pack_fd, inbuf, n, from); if (n < 0) die_errno(_("cannot pread pack file")); if (!n) - die(Q_("premature end of pack file, %lu byte missing", - "premature end of pack file, %lu bytes missing", - len), - len); + die(Q_("premature end of pack file, %"PRIuMAX" byte missing", + "premature end of pack file, %"PRIuMAX" bytes missing", + (unsigned int)len), + (uintmax_t)len); from += n; len -= n; stream.next_in = inbuf; From fd3e67474c0b349049ccff5f72a50ef930a56013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:44:01 +0200 Subject: [PATCH 4/7] index-pack: report correct bad object offsets even if they are large MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the right type for offsets in this case, off_t, which makes a difference on 32-bit systems with large file support, and change formatting code accordingly. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index cafaab7b13..e2d8ae4a02 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -338,10 +338,10 @@ static void parse_pack_header(void) use(sizeof(struct pack_header)); } -static NORETURN void bad_object(unsigned long offset, const char *format, +static NORETURN void bad_object(off_t offset, const char *format, ...) __attribute__((format (printf, 2, 3))); -static NORETURN void bad_object(unsigned long offset, const char *format, ...) +static NORETURN void bad_object(off_t offset, const char *format, ...) { va_list params; char buf[1024]; @@ -349,7 +349,8 @@ static NORETURN void bad_object(unsigned long offset, const char *format, ...) va_start(params, format); vsnprintf(buf, sizeof(buf), format, params); va_end(params); - die(_("pack has bad object at offset %lu: %s"), offset, buf); + die(_("pack has bad object at offset %"PRIuMAX": %s"), + (uintmax_t)offset, buf); } static inline struct thread_local *get_thread_data(void) From da49a7da3ad5e31fa858a6d48d8a6af9c4690724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:44:02 +0200 Subject: [PATCH 5/7] index-pack: correct "offset" type in unpack_entry_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unpack_entry_data() receives an off_t value from unpack_raw_entry(), which could be larger than unsigned long on 32-bit systems with large file support. Correct the type so truncation does not happen. This only affects bad object reporting though. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index e2d8ae4a02..1008d7f63c 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -430,7 +430,7 @@ static int is_delta_type(enum object_type type) return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); } -static void *unpack_entry_data(unsigned long offset, unsigned long size, +static void *unpack_entry_data(off_t offset, unsigned long size, enum object_type type, unsigned char *sha1) { static char fixed_buf[8192]; From af92a645d30b9ac775cdfe5dd56ea1d66fb6e492 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:44:03 +0200 Subject: [PATCH 6/7] pack-objects: do not truncate result in-pack object size on 32-bit systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A typical diff will not show what's going on and you need to see full functions. The core code is like this, at the end of of write_one() e->idx.offset = *offset; size = write_object(f, e, *offset); if (!size) { e->idx.offset = recursing; return WRITE_ONE_BREAK; } written_list[nr_written++] = &e->idx; /* make sure off_t is sufficiently large not to wrap */ if (signed_add_overflows(*offset, size)) die("pack too large for current definition of off_t"); *offset += size; Here we can see that the in-pack object size is returned by write_object (or indirectly by write_reuse_object). And it's used to calculate object offsets, which end up in the pack index file, generated at the end. If "size" overflows (on 32-bit sytems, unsigned long is 32-bit while off_t can be 64-bit), we got wrong offsets and produce incorrect .idx file, which may make it look like the .pack file is corrupted. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index a3a98c5741..ac7a3a5895 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -341,8 +341,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent } /* Return 0 if we will bust the pack-size limit */ -static unsigned long write_reuse_object(struct sha1file *f, struct object_entry *entry, - unsigned long limit, int usable_delta) +static off_t write_reuse_object(struct sha1file *f, struct object_entry *entry, + unsigned long limit, int usable_delta) { struct packed_git *p = entry->in_pack; struct pack_window *w_curs = NULL; @@ -415,11 +415,12 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry } /* Return 0 if we will bust the pack-size limit */ -static unsigned long write_object(struct sha1file *f, - struct object_entry *entry, - off_t write_offset) +static off_t write_object(struct sha1file *f, + struct object_entry *entry, + off_t write_offset) { - unsigned long limit, len; + unsigned long limit; + off_t len; int usable_delta, to_reuse; if (!pack_to_stdout) @@ -491,7 +492,7 @@ static enum write_one_status write_one(struct sha1file *f, struct object_entry *e, off_t *offset) { - unsigned long size; + off_t size; int recursing; /* From ec9d224903053e045d99c36149703501098b021c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 13 Jul 2016 17:44:04 +0200 Subject: [PATCH 7/7] fsck: use streaming interface for large blobs in pack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For blobs, we want to make sure the on-disk data is not corrupted (i.e. can be inflated and produce the expected SHA-1). Blob content is opaque, there's nothing else inside to check for. For really large blobs, we may want to avoid unpacking the entire blob in memory, just to check whether it produces the same SHA-1. On 32-bit systems, we may not have enough virtual address space for such memory allocation. And even on 64-bit where it's not a problem, allocating a lot more memory could result in kicking other parts of systems to swap file, generating lots of I/O and slowing everything down. For this particular operation, not unpacking the blob and letting check_sha1_signature, which supports streaming interface, do the job is sufficient. check_sha1_signature() is not shown in the diff, unfortunately. But if will be called when "data_valid && !data" is false. We will call the callback function "fn" with NULL as "data". The only callback of this function is fsck_obj_buffer(), which does not touch "data" at all if it's a blob. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/fsck.c | 4 ++++ pack-check.c | 23 +++++++++++++++++++++-- pack.h | 1 + t/t1050-large.sh | 7 +++---- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 55eac756f7..b08bc8be24 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -356,6 +356,10 @@ static int fsck_sha1(const unsigned char *sha1) static int fsck_obj_buffer(const unsigned char *sha1, enum object_type type, unsigned long size, void *buffer, int *eaten) { + /* + * Note, buffer may be NULL if type is OBJ_BLOB. See + * verify_packfile(), data_valid variable for details. + */ struct object *obj; obj = parse_object_buffer(sha1, type, size, buffer, eaten); if (!obj) { diff --git a/pack-check.c b/pack-check.c index 1da89a41ce..d123846ea2 100644 --- a/pack-check.c +++ b/pack-check.c @@ -105,6 +105,8 @@ static int verify_packfile(struct packed_git *p, void *data; enum object_type type; unsigned long size; + off_t curpos; + int data_valid; if (p->index_version > 1) { off_t offset = entries[i].offset; @@ -116,8 +118,25 @@ static int verify_packfile(struct packed_git *p, sha1_to_hex(entries[i].sha1), p->pack_name, (uintmax_t)offset); } - data = unpack_entry(p, entries[i].offset, &type, &size); - if (!data) + + curpos = entries[i].offset; + type = unpack_object_header(p, w_curs, &curpos, &size); + unuse_pack(w_curs); + + if (type == OBJ_BLOB && big_file_threshold <= size) { + /* + * Let check_sha1_signature() check it with + * the streaming interface; no point slurping + * the data in-core only to discard. + */ + data = NULL; + data_valid = 0; + } else { + data = unpack_entry(p, entries[i].offset, &type, &size); + data_valid = 1; + } + + if (data_valid && !data) err = error("cannot unpack %s from %s at offset %"PRIuMAX"", sha1_to_hex(entries[i].sha1), p->pack_name, (uintmax_t)entries[i].offset); diff --git a/pack.h b/pack.h index 3223f5a038..0e77429df5 100644 --- a/pack.h +++ b/pack.h @@ -74,6 +74,7 @@ struct pack_idx_entry { struct progress; +/* Note, the data argument could be NULL if object type is blob */ typedef int (*verify_fn)(const unsigned char*, enum object_type, unsigned long, void*, int*); extern const char *write_idx_file(const char *index_name, struct pack_idx_entry **objects, int nr_objects, const struct pack_idx_option *, const unsigned char *sha1); diff --git a/t/t1050-large.sh b/t/t1050-large.sh index f9f3d1391f..096dbffecc 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -177,10 +177,9 @@ test_expect_success 'zip achiving, deflate' ' git archive --format=zip HEAD >/dev/null ' -test_expect_success 'fsck' ' - test_must_fail git fsck 2>err && - n=$(grep "error: attempting to allocate .* over limit" err | wc -l) && - test "$n" -gt 1 +test_expect_success 'fsck large blobs' ' + git fsck 2>err && + test_must_be_empty err ' test_done