From 12f19a9825686e3641803580bda4e2ab2abd44ed Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Wed, 3 Oct 2018 16:04:52 -0700 Subject: [PATCH 1/2] fetch-pack: avoid object flags if no_dependents When fetch_pack() is invoked as part of another Git command (due to a lazy fetch from a partial clone, for example), it uses object flags that may already be used by the outer Git command. The commit that introduced the lazy fetch feature (88e2f9ed8e ("introduce fetch-object: fetch one promisor object", 2017-12-05)) tried to avoid this overlap, but it did not avoid it totally. It was successful in avoiding writing COMPLETE, but did not avoid reading COMPLETE, and did not avoid writing and reading ALTERNATE. Ensure that no flags are written or read by fetch_pack() in the case where it is used to perform a lazy fetch. To do this, it is sufficient to avoid checking completeness of wanted refs (unnecessary in the case of lazy fetches), and to avoid negotiation-related work (in the current implementation, already, no negotiation is performed). After that was done, the lack of overlap was verified by checking all direct and indirect usages of COMPLETE and ALTERNATE - that they are read or written only if no_dependents is false. There are other possible solutions to this issue: (1) Split fetch-pack.{c,h} into a flag-using part and a non-flag-using part, and whenever no_dependents is set, only use the non-flag-using part. (2) Make fetch_pack() be able to be used with arbitrary repository objects. fetch_pack() should then create its own repository object based on the given repository object, with its own object hashtable, so that the flags do not conflict. (1) is possible but invasive - some functions would need to be split; and such invasiveness would potentially be unnecessary if we ever were to need (2) anyway. (2) would be useful if we were to support, say, submodules that were partial clones themselves, but I don't know when or if the Git project plans to support those. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- fetch-pack.c | 101 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/fetch-pack.c b/fetch-pack.c index 88a078e9be..973d72f367 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -253,8 +253,10 @@ static int find_common(struct fetch_negotiator *negotiator, if (args->stateless_rpc && multi_ack == 1) die(_("--stateless-rpc requires multi_ack_detailed")); - mark_tips(negotiator, args->negotiation_tips); - for_each_cached_alternate(negotiator, insert_one_alternate_object); + if (!args->no_dependents) { + mark_tips(negotiator, args->negotiation_tips); + for_each_cached_alternate(negotiator, insert_one_alternate_object); + } fetching = 0; for ( ; refs ; refs = refs->next) { @@ -271,8 +273,12 @@ static int find_common(struct fetch_negotiator *negotiator, * We use lookup_object here because we are only * interested in the case we *know* the object is * reachable and we have already scanned it. + * + * Do this only if args->no_dependents is false (if it is true, + * we cannot trust the object flags). */ - if (((o = lookup_object(the_repository, remote->hash)) != NULL) && + if (!args->no_dependents && + ((o = lookup_object(the_repository, remote->hash)) != NULL) && (o->flags & COMPLETE)) { continue; } @@ -710,31 +716,29 @@ static void mark_complete_and_common_ref(struct fetch_negotiator *negotiator, oidset_clear(&loose_oid_set); - if (!args->no_dependents) { - if (!args->deepen) { - for_each_ref(mark_complete_oid, NULL); - for_each_cached_alternate(NULL, mark_alternate_complete); - commit_list_sort_by_date(&complete); - if (cutoff) - mark_recent_complete_commits(args, cutoff); - } + if (!args->deepen) { + for_each_ref(mark_complete_oid, NULL); + for_each_cached_alternate(NULL, mark_alternate_complete); + commit_list_sort_by_date(&complete); + if (cutoff) + mark_recent_complete_commits(args, cutoff); + } - /* - * Mark all complete remote refs as common refs. - * Don't mark them common yet; the server has to be told so first. - */ - for (ref = *refs; ref; ref = ref->next) { - struct object *o = deref_tag(the_repository, - lookup_object(the_repository, - ref->old_oid.hash), - NULL, 0); + /* + * Mark all complete remote refs as common refs. + * Don't mark them common yet; the server has to be told so first. + */ + for (ref = *refs; ref; ref = ref->next) { + struct object *o = deref_tag(the_repository, + lookup_object(the_repository, + ref->old_oid.hash), + NULL, 0); - if (!o || o->type != OBJ_COMMIT || !(o->flags & COMPLETE)) - continue; + if (!o || o->type != OBJ_COMMIT || !(o->flags & COMPLETE)) + continue; - negotiator->known_common(negotiator, - (struct commit *)o); - } + negotiator->known_common(negotiator, + (struct commit *)o); } save_commit_buffer = old_save_commit_buffer; @@ -990,11 +994,15 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, if (!server_supports("deepen-relative") && args->deepen_relative) die(_("Server does not support --deepen")); - mark_complete_and_common_ref(&negotiator, args, &ref); - filter_refs(args, &ref, sought, nr_sought); - if (everything_local(args, &ref)) { - packet_flush(fd[1]); - goto all_done; + if (!args->no_dependents) { + mark_complete_and_common_ref(&negotiator, args, &ref); + filter_refs(args, &ref, sought, nr_sought); + if (everything_local(args, &ref)) { + packet_flush(fd[1]); + goto all_done; + } + } else { + filter_refs(args, &ref, sought, nr_sought); } if (find_common(&negotiator, args, fd, &oid, ref) < 0) if (!args->keep_pack) @@ -1040,7 +1048,7 @@ static void add_shallow_requests(struct strbuf *req_buf, } } -static void add_wants(const struct ref *wants, struct strbuf *req_buf) +static void add_wants(int no_dependents, const struct ref *wants, struct strbuf *req_buf) { int use_ref_in_want = server_supports_feature("fetch", "ref-in-want", 0); @@ -1057,8 +1065,12 @@ static void add_wants(const struct ref *wants, struct strbuf *req_buf) * We use lookup_object here because we are only * interested in the case we *know* the object is * reachable and we have already scanned it. + * + * Do this only if args->no_dependents is false (if it is true, + * we cannot trust the object flags). */ - if (((o = lookup_object(the_repository, remote->hash)) != NULL) && + if (!no_dependents && + ((o = lookup_object(the_repository, remote->hash)) != NULL) && (o->flags & COMPLETE)) { continue; } @@ -1155,7 +1167,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out, } /* add wants */ - add_wants(wants, &req_buf); + add_wants(args->no_dependents, wants, &req_buf); if (args->no_dependents) { packet_buf_write(&req_buf, "done"); @@ -1346,16 +1358,21 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, args->deepen = 1; /* Filter 'ref' by 'sought' and those that aren't local */ - mark_complete_and_common_ref(&negotiator, args, &ref); - filter_refs(args, &ref, sought, nr_sought); - if (everything_local(args, &ref)) - state = FETCH_DONE; - else - state = FETCH_SEND_REQUEST; + if (!args->no_dependents) { + mark_complete_and_common_ref(&negotiator, args, &ref); + filter_refs(args, &ref, sought, nr_sought); + if (everything_local(args, &ref)) + state = FETCH_DONE; + else + state = FETCH_SEND_REQUEST; - mark_tips(&negotiator, args->negotiation_tips); - for_each_cached_alternate(&negotiator, - insert_one_alternate_object); + mark_tips(&negotiator, args->negotiation_tips); + for_each_cached_alternate(&negotiator, + insert_one_alternate_object); + } else { + filter_refs(args, &ref, sought, nr_sought); + state = FETCH_SEND_REQUEST; + } break; case FETCH_SEND_REQUEST: if (send_fetch_request(&negotiator, fd[1], args, ref, From 4c7f9567ea2628451a0f4ad52599a25d34657bae Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Wed, 3 Oct 2018 16:04:53 -0700 Subject: [PATCH 2/2] fetch-pack: exclude blobs when lazy-fetching trees A partial clone with missing trees can be obtained using "git clone --filter=tree:none ". In such a repository, when a tree needs to be lazily fetched, any tree or blob it directly or indirectly references is fetched as well, regardless of whether the original command required those objects, or if the local repository already had some of them. This is because the fetch protocol, which the lazy fetch uses, does not allow clients to request that only the wanted objects be sent, which would be the ideal solution. This patch implements a partial solution: specify the "blob:none" filter, somewhat reducing the fetch payload. This change has no effect when lazily fetching blobs (due to how filters work). And if lazily fetching a commit (such repositories are difficult to construct and is not a use case we support very well, but it is possible), referenced commits and trees are still fetched - only the blobs are not fetched. The necessary code change is done in fetch_pack() instead of somewhere closer to where the "filter" instruction is written to the wire so that only one part of the code needs to be changed in order for users of all protocol versions to benefit from this optimization. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- fetch-pack.c | 14 ++++++++++++++ fetch-pack.h | 7 +++++++ t/t0410-partial-clone.sh | 41 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/fetch-pack.c b/fetch-pack.c index 973d72f367..79007f996c 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -1615,6 +1615,20 @@ struct ref *fetch_pack(struct fetch_pack_args *args, if (nr_sought) nr_sought = remove_duplicates_in_refs(sought, nr_sought); + if (args->no_dependents && !args->filter_options.choice) { + /* + * The protocol does not support requesting that only the + * wanted objects be sent, so approximate this by setting a + * "blob:none" filter if no filter is already set. This works + * for all object types: note that wanted blobs will still be + * sent because they are directly specified as a "want". + * + * NEEDSWORK: Add an option in the protocol to request that + * only the wanted objects be sent, and implement it. + */ + parse_list_objects_filter(&args->filter_options, "blob:none"); + } + if (!ref) { packet_flush(fd[1]); die(_("no matching remote head")); diff --git a/fetch-pack.h b/fetch-pack.h index 5b6e868802..43ec344d95 100644 --- a/fetch-pack.h +++ b/fetch-pack.h @@ -43,6 +43,13 @@ struct fetch_pack_args { unsigned from_promisor:1; /* + * Attempt to fetch only the wanted objects, and not any objects + * referred to by them. Due to protocol limitations, extraneous + * objects may still be included. (When fetching non-blob + * objects, only blobs are excluded; when fetching a blob, the + * blob itself will still be sent. The client does not need to + * know whether a wanted object is a blob or not.) + * * If 1, fetch_pack() will also not modify any object flags. * This allows fetch_pack() to safely be called by any function, * regardless of which object flags it uses (if any). diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh index 1281300664..08a0c3651c 100755 --- a/t/t0410-partial-clone.sh +++ b/t/t0410-partial-clone.sh @@ -170,6 +170,47 @@ test_expect_success 'fetching of missing objects' ' git verify-pack --verbose "$IDX" | grep "$HASH" ' +test_expect_success 'fetching of missing blobs works' ' + rm -rf server repo && + test_create_repo server && + test_commit -C server foo && + git -C server repack -a -d --write-bitmap-index && + + git clone "file://$(pwd)/server" repo && + git hash-object repo/foo.t >blobhash && + rm -rf repo/.git/objects/* && + + git -C server config uploadpack.allowanysha1inwant 1 && + git -C server config uploadpack.allowfilter 1 && + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialclone "origin" && + + git -C repo cat-file -p $(cat blobhash) +' + +test_expect_success 'fetching of missing trees does not fetch blobs' ' + rm -rf server repo && + test_create_repo server && + test_commit -C server foo && + git -C server repack -a -d --write-bitmap-index && + + git clone "file://$(pwd)/server" repo && + git -C repo rev-parse foo^{tree} >treehash && + git hash-object repo/foo.t >blobhash && + rm -rf repo/.git/objects/* && + + git -C server config uploadpack.allowanysha1inwant 1 && + git -C server config uploadpack.allowfilter 1 && + git -C repo config core.repositoryformatversion 1 && + git -C repo config extensions.partialclone "origin" && + git -C repo cat-file -p $(cat treehash) && + + # Ensure that the tree, but not the blob, is fetched + git -C repo rev-list --objects --missing=print $(cat treehash) >objects && + grep "^$(cat treehash)" objects && + grep "^[?]$(cat blobhash)" objects +' + test_expect_success 'rev-list stops traversal at missing and promised commit' ' rm -rf repo && test_create_repo repo &&