From 7056e7f0c8e55d5c3fae5648d5434d83436e8977 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 7 Nov 2022 18:35:44 +0000 Subject: [PATCH] chunk-format: allow trailing table of contents The existing chunk formats use the table of contents at the beginning of the file. This is intended as a way to speed up the initial loading of the file, but comes at a cost during writes. Each existing format needs to fully compute how big each chunk will be in advance, which usually requires storing the full file contents in memory. Future file formats may want to use the chunk format API in cases where the writing stage is critical to performance, so we may want to stream updates from an existing file and then only write the table of contents at the end. Add a new 'flags' parameter to write_chunkfile() that allows this behavior. When the CHUNKFILE_TRAILING_TOC flag is specified, the defensive programming that checks that the chunks are written with the precomputed sizes is disabled. Then, the table of contents is written in reverse order at the end of the hashfile, so a parser can read the chunk list starting from the end of the file (minus the hash). The parsing of this trailing table of contents will come in a later change. 
Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau --- chunk-format.c | 53 +++++++++++++++++++++++++++++++++++--------------- chunk-format.h | 9 ++++++++- commit-graph.c | 2 +- midx.c | 2 +- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/chunk-format.c b/chunk-format.c index f1b2c8a8b3..3f5cc9b5dd 100644 --- a/chunk-format.c +++ b/chunk-format.c @@ -57,27 +57,32 @@ void add_chunk(struct chunkfile *cf, cf->chunks_nr++; } -int write_chunkfile(struct chunkfile *cf, void *data) +int write_chunkfile(struct chunkfile *cf, + enum chunkfile_flags flags, + void *data) { int i, result = 0; - uint64_t cur_offset = hashfile_total(cf->f); trace2_region_enter("chunkfile", "write", the_repository); - /* Add the table of contents to the current offset */ - cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE; + if (!(flags & CHUNKFILE_TRAILING_TOC)) { + uint64_t cur_offset = hashfile_total(cf->f); - for (i = 0; i < cf->chunks_nr; i++) { - hashwrite_be32(cf->f, cf->chunks[i].id); + /* Add the table of contents to the current offset */ + cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE; + + for (i = 0; i < cf->chunks_nr; i++) { + hashwrite_be32(cf->f, cf->chunks[i].id); + hashwrite_be64(cf->f, cur_offset); + + cur_offset += cf->chunks[i].size; + } + + /* Trailing entry marks the end of the chunks */ + hashwrite_be32(cf->f, 0); hashwrite_be64(cf->f, cur_offset); - - cur_offset += cf->chunks[i].size; } - /* Trailing entry marks the end of the chunks */ - hashwrite_be32(cf->f, 0); - hashwrite_be64(cf->f, cur_offset); - for (i = 0; i < cf->chunks_nr; i++) { cf->chunks[i].offset = hashfile_total(cf->f); result = cf->chunks[i].write_fn(cf->f, data); @@ -85,10 +90,26 @@ int write_chunkfile(struct chunkfile *cf, void *data) if (result) goto cleanup; - if (hashfile_total(cf->f) - cf->chunks[i].offset != cf->chunks[i].size) - BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", - cf->chunks[i].size, cf->chunks[i].id, - 
hashfile_total(cf->f) - cf->chunks[i].offset); + if (!(flags & CHUNKFILE_TRAILING_TOC)) { + if (hashfile_total(cf->f) - cf->chunks[i].offset != cf->chunks[i].size) + BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", + cf->chunks[i].size, cf->chunks[i].id, + hashfile_total(cf->f) - cf->chunks[i].offset); + } + + cf->chunks[i].size = hashfile_total(cf->f) - cf->chunks[i].offset; + } + + if (flags & CHUNKFILE_TRAILING_TOC) { + size_t last_chunk_tail = hashfile_total(cf->f); + /* First entry marks the end of the chunks */ + hashwrite_be32(cf->f, 0); + hashwrite_be64(cf->f, last_chunk_tail); + + for (i = cf->chunks_nr - 1; i >= 0; i--) { + hashwrite_be32(cf->f, cf->chunks[i].id); + hashwrite_be64(cf->f, cf->chunks[i].offset); + } } cleanup: diff --git a/chunk-format.h b/chunk-format.h index 7885aa0848..39e8967e95 100644 --- a/chunk-format.h +++ b/chunk-format.h @@ -31,7 +31,14 @@ void add_chunk(struct chunkfile *cf, uint32_t id, size_t size, chunk_write_fn fn); -int write_chunkfile(struct chunkfile *cf, void *data); + +enum chunkfile_flags { + CHUNKFILE_TRAILING_TOC = (1 << 0), +}; + +int write_chunkfile(struct chunkfile *cf, + enum chunkfile_flags flags, + void *data); int read_table_of_contents(struct chunkfile *cf, const unsigned char *mfile, diff --git a/commit-graph.c b/commit-graph.c index a7d8755932..c927b81250 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1932,7 +1932,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) get_num_chunks(cf) * ctx->commits.nr); } - write_chunkfile(cf, ctx); + write_chunkfile(cf, 0, ctx); stop_progress(&ctx->progress); strbuf_release(&progress_title); diff --git a/midx.c b/midx.c index 7cfad04a24..03d947a5d3 100644 --- a/midx.c +++ b/midx.c @@ -1510,7 +1510,7 @@ static int write_midx_internal(const char *object_dir, } write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs); - write_chunkfile(cf, &ctx); + write_chunkfile(cf, 0, &ctx); 
finalize_hashfile(f, midx_hash, FSYNC_COMPONENT_PACK_METADATA, CSUM_FSYNC | CSUM_HASH_IN_STREAM);