From a0851cece542be9fabf00d6145860562db450205 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:52:47 +0200 Subject: [PATCH 01/13] path: harden validation of HEAD with non-standard hashes The `validate_headref()` function takes a path to a supposed "HEAD" file and checks whether its format is something that we understand. It is used as part of our repository discovery to check whether a specific directory is a Git directory or not. Part of the validation is a check for a detached HEAD that contains a plain object ID. To do this validation we use `get_oid_hex()`, which relies on `the_hash_algo`. At this point in time the hash algo cannot yet be initialized though because we didn't yet read the Git config. Consequently, it will always be the SHA1 hash algorithm. In practice this works alright because `get_oid_hex()` only ends up checking whether the prefix of the buffer is a valid object ID. And because SHA1 is shorter than SHA256, the function will successfully parse SHA256 object IDs, as well. It is somewhat fragile though and not really the intent to only check for SHA1. With this in mind, harden the code to use `get_oid_hex_any()` to check whether the "HEAD" file parses as any known hash. One might be hard pressed to tighten the check even further and fully validate the file contents, not only the prefix. In practice though that wouldn't make a lot of sense as it could be that the repository uses a hash function that produces longer hashes than SHA256, but which the current version of Git doesn't understand yet. We'd still want to detect the repository as proper Git repository in that case, and we will fail eventually with a proper error message that the hash isn't understood when trying to set up the repository format. It follows that we could just leave the current code intact, as in practice the code change doesn't have any user visible impact. But it also prepares us for `the_hash_algo` being unset when there is no repository. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- path.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/path.c b/path.c index 67229edb9c..cc02165530 100644 --- a/path.c +++ b/path.c @@ -693,7 +693,7 @@ int validate_headref(const char *path) /* * Is this a detached HEAD? */ - if (!get_oid_hex(buffer, &oid)) + if (get_oid_hex_any(buffer, &oid) != GIT_HASH_UNKNOWN) return 0; return -1; From 0c6bd2b81d1e18dfa1e143c354c554cca34b3685 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:52:51 +0200 Subject: [PATCH 02/13] path: move `validate_headref()` to its only user While `validate_headref()` is only called from `is_git_directory()` in "setup.c", it is currently implemented in "path.c". Move it over such that it becomes clear that it is only really used during setup in order to discover repositories. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- path.c | 53 ----------------------------------------------------- path.h | 1 - setup.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 54 deletions(-) diff --git a/path.c b/path.c index cc02165530..bd6e25245d 100644 --- a/path.c +++ b/path.c @@ -5,7 +5,6 @@ #include "abspath.h" #include "environment.h" #include "gettext.h" -#include "hex.h" #include "repository.h" #include "strbuf.h" #include "string-list.h" @@ -647,58 +646,6 @@ void strbuf_git_common_path(struct strbuf *sb, va_end(args); } -int validate_headref(const char *path) -{ - struct stat st; - char buffer[256]; - const char *refname; - struct object_id oid; - int fd; - ssize_t len; - - if (lstat(path, &st) < 0) - return -1; - - /* Make sure it is a "refs/.." symlink */ - if (S_ISLNK(st.st_mode)) { - len = readlink(path, buffer, sizeof(buffer)-1); - if (len >= 5 && !memcmp("refs/", buffer, 5)) - return 0; - return -1; - } - - /* - * Anything else, just open it and try to see if it is a symbolic ref. - */ - fd = open(path, O_RDONLY); - if (fd < 0) - return -1; - len = read_in_full(fd, buffer, sizeof(buffer)-1); - close(fd); - - if (len < 0) - return -1; - buffer[len] = '\0'; - - /* - * Is it a symbolic ref? - */ - if (skip_prefix(buffer, "ref:", &refname)) { - while (isspace(*refname)) - refname++; - if (starts_with(refname, "refs/")) - return 0; - } - - /* - * Is this a detached HEAD? - */ - if (get_oid_hex_any(buffer, &oid) != GIT_HASH_UNKNOWN) - return 0; - - return -1; -} - static struct passwd *getpw_str(const char *username, size_t len) { struct passwd *pw; diff --git a/path.h b/path.h index ea96487b29..c3bc8617bd 100644 --- a/path.h +++ b/path.h @@ -173,7 +173,6 @@ const char *git_path_fetch_head(struct repository *r); const char *git_path_shallow(struct repository *r); int ends_with_path_components(const char *path, const char *components); -int validate_headref(const char *ref); int calc_shared_perm(int mode); int adjust_shared_perm(const char *path); diff --git a/setup.c b/setup.c index f4b32f76e3..7c996659bd 100644 --- a/setup.c +++ b/setup.c @@ -4,6 +4,7 @@ #include "environment.h" #include "exec-cmd.h" #include "gettext.h" +#include "hex.h" #include "object-name.h" #include "refs.h" #include "repository.h" @@ -341,6 +342,58 @@ int get_common_dir_noenv(struct strbuf *sb, const char *gitdir) return ret; } +static int validate_headref(const char *path) +{ + struct stat st; + char buffer[256]; + const char *refname; + struct object_id oid; + int fd; + ssize_t len; + + if (lstat(path, &st) < 0) + return -1; + + /* Make sure it is a "refs/.." symlink */ + if (S_ISLNK(st.st_mode)) { + len = readlink(path, buffer, sizeof(buffer)-1); + if (len >= 5 && !memcmp("refs/", buffer, 5)) + return 0; + return -1; + } + + /* + * Anything else, just open it and try to see if it is a symbolic ref. + */ + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + len = read_in_full(fd, buffer, sizeof(buffer)-1); + close(fd); + + if (len < 0) + return -1; + buffer[len] = '\0'; + + /* + * Is it a symbolic ref? + */ + if (skip_prefix(buffer, "ref:", &refname)) { + while (isspace(*refname)) + refname++; + if (starts_with(refname, "refs/")) + return 0; + } + + /* + * Is this a detached HEAD? + */ + if (get_oid_hex_any(buffer, &oid) != GIT_HASH_UNKNOWN) + return 0; + + return -1; +} + /* * Test if it looks like we're at a git directory. * We want to see: From b7afb462258ab02d94e437652cd230a7827d3329 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:52:56 +0200 Subject: [PATCH 03/13] parse-options-cb: only abbreviate hashes when hash algo is known The `OPT__ABBREV()` option can be used to add an option that abbreviates object IDs. When given a length longer than `the_hash_algo->hexsz`, then it will instead set the length to that maximum length. It may not always be guaranteed that we have `the_hash_algo` initialized properly as the hash algorithm can only be set up after we have set up `the_repository`. In that case, the hash would always be truncated to the hex length of SHA1, which may not be what the user desires. In practice it's not a problem as all commands that use `OPT__ABBREV()` also have `RUN_SETUP` set and thus cannot work without a repository. Consequently, both `the_repository` and `the_hash_algo` would be properly set up. Regardless of that, harden the code to not truncate the length when we didn't set up a repository. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- parse-options-cb.c | 3 ++- t/t0040-parse-options.sh | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/parse-options-cb.c b/parse-options-cb.c index bdc7fae497..d99d688d3c 100644 --- a/parse-options-cb.c +++ b/parse-options-cb.c @@ -7,6 +7,7 @@ #include "environment.h" #include "gettext.h" #include "object-name.h" +#include "setup.h" #include "string-list.h" #include "strvec.h" #include "oid-array.h" @@ -29,7 +30,7 @@ int parse_opt_abbrev_cb(const struct option *opt, const char *arg, int unset) opt->long_name); if (v && v < MINIMUM_ABBREV) v = MINIMUM_ABBREV; - else if (v > the_hash_algo->hexsz) + else if (startup_info->have_repository && v > the_hash_algo->hexsz) v = the_hash_algo->hexsz; } *(int *)(opt->value) = v; diff --git a/t/t0040-parse-options.sh b/t/t0040-parse-options.sh index 8bb2a8b453..45a773642f 100755 --- a/t/t0040-parse-options.sh +++ b/t/t0040-parse-options.sh @@ -176,6 +176,23 @@ test_expect_success 'long options' ' test_cmp expect output ' +test_expect_success 'abbreviate to something longer than SHA1 length' ' + cat >expect <<-EOF && + boolean: 0 + integer: 0 + magnitude: 0 + timestamp: 0 + string: (not set) + abbrev: 100 + verbose: -1 + quiet: 0 + dry run: no + file: (not set) + EOF + test-tool parse-options --abbrev=100 >output && + test_cmp expect output +' + test_expect_success 'missing required value' ' cat >expect <<-\EOF && error: switch `s'\'' requires a value From bbb82f8dc88aee588a35615fdb10862f3b41e16c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:01 +0200 Subject: [PATCH 04/13] attr: don't recompute default attribute source The `default_attr_source()` function lazily computes the attr source supposedly once, only. This is done via a static variable `attr_source` that contains the resolved object ID of the attr source's tree. If the variable is the null object ID then we try to look up the attr source, otherwise we skip over it. This approach is flawed though: the variable will never be set to anything else but the null object ID in case there is no attr source. Consequently, we re-compute the information on every call. And in the worst case, when we silently ignore bad trees, this will cause us to try and look up the treeish every single time. Improve this by introducing a separate variable `has_attr_source` to track whether we already computed the attr source and, if so, whether we have an attr source or not. This also allows us to convert the `ignore_bad_attr_tree` to not be static anymore as the code will only be executed once anyway. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- attr.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/attr.c b/attr.c index 6af7151088..a5b717e4ce 100644 --- a/attr.c +++ b/attr.c @@ -1206,15 +1206,16 @@ static void collect_some_attrs(struct index_state *istate, } static const char *default_attr_source_tree_object_name; -static int ignore_bad_attr_tree; void set_git_attr_source(const char *tree_object_name) { default_attr_source_tree_object_name = xstrdup(tree_object_name); } -static void compute_default_attr_source(struct object_id *attr_source) +static int compute_default_attr_source(struct object_id *attr_source) { + int ignore_bad_attr_tree = 0; + if (!default_attr_source_tree_object_name) default_attr_source_tree_object_name = getenv(GIT_ATTR_SOURCE_ENVIRONMENT); @@ -1223,22 +1224,28 @@ static void compute_default_attr_source(struct object_id *attr_source) ignore_bad_attr_tree = 1; } - if (!default_attr_source_tree_object_name || !is_null_oid(attr_source)) - return; + if (!default_attr_source_tree_object_name) + return 0; if (repo_get_oid_treeish(the_repository, default_attr_source_tree_object_name, - attr_source) && !ignore_bad_attr_tree) - die(_("bad --attr-source or GIT_ATTR_SOURCE")); + attr_source)) { + if (!ignore_bad_attr_tree) + die(_("bad --attr-source or GIT_ATTR_SOURCE")); + return 0; + } + + return 1; } static struct object_id *default_attr_source(void) { static struct object_id attr_source; + static int has_attr_source = -1; - if (is_null_oid(&attr_source)) - compute_default_attr_source(&attr_source); - if (is_null_oid(&attr_source)) + if (has_attr_source < 0) + has_attr_source = compute_default_attr_source(&attr_source); + if (!has_attr_source) return NULL; return &attr_source; } From 813f17fd6bd9289185a50f1d1179393bd1339b9b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:05 +0200 Subject: [PATCH 05/13] attr: fix BUG() when parsing attrs outside of repo If either the `--attr-source` option or the `GIT_ATTR_SOURCE` envvar are set, then `compute_default_attr_source()` will try to look up the value as a treeish. It is possible to hit that function while outside of a Git repository though, for example when using `git grep --no-index`. In that case, Git will hit a bug because we try to look up the main ref store outside of a repository. Handle the case gracefully and detect when we try to look up an attr source without a repository. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- attr.c | 6 ++++++ t/t0003-attributes.sh | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/attr.c b/attr.c index a5b717e4ce..c89ab3478e 100644 --- a/attr.c +++ b/attr.c @@ -1227,6 +1227,12 @@ static int compute_default_attr_source(struct object_id *attr_source) if (!default_attr_source_tree_object_name) return 0; + if (!startup_info->have_repository) { + if (!ignore_bad_attr_tree) + die(_("cannot use --attr-source or GIT_ATTR_SOURCE without repo")); + return 0; + } + if (repo_get_oid_treeish(the_repository, default_attr_source_tree_object_name, attr_source)) { diff --git a/t/t0003-attributes.sh b/t/t0003-attributes.sh index d755cc3c29..72fadca1e8 100755 --- a/t/t0003-attributes.sh +++ b/t/t0003-attributes.sh @@ -434,6 +434,21 @@ test_expect_success 'precedence of --attr-source, GIT_ATTR_SOURCE, then attr.tre ) ' +test_expect_success 'diff without repository with attr source' ' + mkdir -p "$TRASH_DIRECTORY/outside/nongit" && + ( + cd "$TRASH_DIRECTORY/outside/nongit" && + GIT_CEILING_DIRECTORIES="$TRASH_DIRECTORY/outside" && + export GIT_CEILING_DIRECTORIES && + touch file && + cat >expect <<-EOF && + fatal: cannot use --attr-source or GIT_ATTR_SOURCE without repo + EOF + test_must_fail env GIT_ATTR_SOURCE=HEAD git grep --no-index foo file 2>err && + test_cmp expect err + ) +' + test_expect_success 'bare repository: with --source' ' ( cd bare.git && From bd455cec37551557e9122102606299c4bdc93505 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:10 +0200 Subject: [PATCH 06/13] remote-curl: fix parsing of detached SHA256 heads The dumb HTTP transport tries to read the remote HEAD reference by downloading the "HEAD" file and then parsing it via `http_fetch_ref()`. This function will either parse the file as an object ID in case it is exactly `the_hash_algo->hexsz` long, or otherwise it will check whether the reference starts with "ref :" and parse it as a symbolic ref. This is broken when parsing detached HEADs of a remote SHA256 repository because we never update `the_hash_algo` to the discovered remote object hash. Consequently, `the_hash_algo` will always be the fallback SHA1 hash algorithm, which will cause us to fail parsing HEAD altogteher when it contains a SHA256 object ID. Fix this issue by setting up `the_hash_algo` via `repo_set_hash_algo()`. While at it, let's make the expected SHA1 fallback explicit in our code, which also addresses an upcoming issue where we are going to remove the SHA1 fallback for `the_hash_algo`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- remote-curl.c | 19 ++++++++++++++++++- t/t5550-http-fetch-dumb.sh | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/remote-curl.c b/remote-curl.c index 0b6d7815fd..004b707fdf 100644 --- a/remote-curl.c +++ b/remote-curl.c @@ -266,12 +266,23 @@ static struct ref *parse_git_refs(struct discovery *heads, int for_push) return list; } +/* + * Try to detect the hash algorithm used by the remote repository when using + * the dumb HTTP transport. As dumb transports cannot tell us the object hash + * directly have to derive it from the advertised ref lengths. + */ static const struct git_hash_algo *detect_hash_algo(struct discovery *heads) { const char *p = memchr(heads->buf, '\t', heads->len); int algo; + + /* + * In case the remote has no refs we have no way to reliably determine + * the object hash used by that repository. In that case we simply fall + * back to SHA1, which may or may not be correct. + */ if (!p) - return the_hash_algo; + return &hash_algos[GIT_HASH_SHA1]; algo = hash_algo_by_length((p - heads->buf) / 2); if (algo == GIT_HASH_UNKNOWN) @@ -295,6 +306,12 @@ static struct ref *parse_info_refs(struct discovery *heads) "is this a git repository?", transport_anonymize_url(url.buf)); + /* + * Set the repository's hash algo to whatever we have just detected. + * This ensures that we can correctly parse the remote references. + */ + repo_set_hash_algo(the_repository, hash_algo_by_ptr(options.hash_algo)); + data = heads->buf; start = NULL; mid = data; diff --git a/t/t5550-http-fetch-dumb.sh b/t/t5550-http-fetch-dumb.sh index 4c3b32785d..5f16cbc58d 100755 --- a/t/t5550-http-fetch-dumb.sh +++ b/t/t5550-http-fetch-dumb.sh @@ -55,6 +55,21 @@ test_expect_success 'list refs from outside any repository' ' test_cmp expect actual ' + +test_expect_success 'list detached HEAD from outside any repository' ' + git clone --mirror "$HTTPD_DOCUMENT_ROOT_PATH/repo.git" \ + "$HTTPD_DOCUMENT_ROOT_PATH/repo-detached.git" && + git -C "$HTTPD_DOCUMENT_ROOT_PATH/repo-detached.git" \ + update-ref --no-deref HEAD refs/heads/main && + git -C "$HTTPD_DOCUMENT_ROOT_PATH/repo-detached.git" update-server-info && + cat >expect <<-EOF && + $(git rev-parse main) HEAD + $(git rev-parse main) refs/heads/main + EOF + nongit git ls-remote "$HTTPD_URL/dumb/repo-detached.git" >actual && + test_cmp expect actual +' + test_expect_success 'create password-protected repository' ' mkdir -p "$HTTPD_DOCUMENT_ROOT_PATH/auth/dumb/" && cp -Rf "$HTTPD_DOCUMENT_ROOT_PATH/repo.git" \ From 07658e9ce5b156876eebf81f39e0672a5347eae5 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:15 +0200 Subject: [PATCH 07/13] builtin/rev-parse: allow shortening to more than 40 hex characters The `--short=` option for git-rev-parse(1) allows the user to specify to how many characters object IDs should be shortened to. The option is broken though for SHA256 repositories because we set the maximum allowed hash size to `the_hash_algo->hexsz` before we have even set up the repo. Consequently, `the_hash_algo` will always be SHA1 and thus we truncate every hash after at most 40 characters. Fix this by accessing `the_hash_algo` only after we have set up the repo. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/rev-parse.c | 5 ++--- t/t1500-rev-parse.sh | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/builtin/rev-parse.c b/builtin/rev-parse.c index af79538632..7d10ee33c4 100644 --- a/builtin/rev-parse.c +++ b/builtin/rev-parse.c @@ -687,7 +687,6 @@ int cmd_rev_parse(int argc, const char **argv, const char *prefix) const char *name = NULL; struct object_context unused; struct strbuf buf = STRBUF_INIT; - const int hexsz = the_hash_algo->hexsz; int seen_end_of_options = 0; enum format_type format = FORMAT_DEFAULT; @@ -863,8 +862,8 @@ int cmd_rev_parse(int argc, const char **argv, const char *prefix) abbrev = strtoul(arg, NULL, 10); if (abbrev < MINIMUM_ABBREV) abbrev = MINIMUM_ABBREV; - else if (hexsz <= abbrev) - abbrev = hexsz; + else if ((int)the_hash_algo->hexsz <= abbrev) + abbrev = the_hash_algo->hexsz; continue; } if (!strcmp(arg, "--sq")) { diff --git a/t/t1500-rev-parse.sh b/t/t1500-rev-parse.sh index a669e592f1..30c31918fd 100755 --- a/t/t1500-rev-parse.sh +++ b/t/t1500-rev-parse.sh @@ -304,4 +304,10 @@ test_expect_success 'rev-parse --bisect includes bad, excludes good' ' test_cmp expect actual ' +test_expect_success '--short= truncates to the actual hash length' ' + git rev-parse HEAD >expect && + git rev-parse --short=100 HEAD >actual && + test_cmp expect actual +' + test_done From ce992ce29a2ebed8daa1f46e712ba38b79b53062 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:20 +0200 Subject: [PATCH 08/13] builtin/blame: don't access potentially unitialized `the_hash_algo` We access `the_hash_algo` in git-blame(1) before we have executed `parse_options_start()`, which may not be properly set up in case we have no repository. This is fine for most of the part because all the call paths that lead to it (git-blame(1), git-annotate(1) as well as git-pick-axe(1)) specify `RUN_SETUP` and thus require a repository. There is one exception though, namely when passing `-h` to print the help. Here we will access `the_hash_algo` even if there is no repo. This works fine right now because `the_hash_algo` gets sets up to point to the SHA1 algorithm via `initialize_repository()`. But we're about to stop doing this, and thus the code would lead to a `NULL` pointer exception. Prepare the code for this and only access `the_hash_algo` after we are sure that there is a proper repository. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/blame.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/builtin/blame.c b/builtin/blame.c index 9aa74680a3..e325825936 100644 --- a/builtin/blame.c +++ b/builtin/blame.c @@ -915,7 +915,6 @@ int cmd_blame(int argc, const char **argv, const char *prefix) struct range_set ranges; unsigned int range_i; long anchor; - const int hexsz = the_hash_algo->hexsz; long num_lines = 0; const char *str_usage = cmd_is_annotate ? annotate_usage : blame_usage; const char **opt_usage = cmd_is_annotate ? annotate_opt_usage : blame_opt_usage; @@ -973,11 +972,11 @@ int cmd_blame(int argc, const char **argv, const char *prefix) } else if (show_progress < 0) show_progress = isatty(2); - if (0 < abbrev && abbrev < hexsz) + if (0 < abbrev && abbrev < (int)the_hash_algo->hexsz) /* one more abbrev length is needed for the boundary commit */ abbrev++; else if (!abbrev) - abbrev = hexsz; + abbrev = the_hash_algo->hexsz; if (revs_file && read_ancestry(revs_file)) die_errno("reading graft file '%s' failed", revs_file); From 332b56b7623c641dbf2f7712e338c0ca2dd423be Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:24 +0200 Subject: [PATCH 09/13] builtin/bundle: abort "verify" early when there is no repository Verifying a bundle requires us to have a repository. This is encoded in `verify_bundle()`, which will return an error if there is no repository. We call `open_bundle()` before we call `verify_bundle()` though, which already performs some verifications even though we may ultimately abort due to a missing repository. This is problematic because `open_bundle()` already reads the bundle header and verifies that it contains a properly formatted hash. When there is no repository we have no clue what hash function to expect though, so we always end up assuming SHA1 here, which may or may not be correct. Furthermore, we are about to stop initializing `the_hash_algo` when there is no repository, which will lead to segfaults. Check early on whether we have a repository to fix this issue. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/bundle.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/builtin/bundle.c b/builtin/bundle.c index 3ad11dc5d0..d5d41a8f67 100644 --- a/builtin/bundle.c +++ b/builtin/bundle.c @@ -140,6 +140,11 @@ static int cmd_bundle_verify(int argc, const char **argv, const char *prefix) { builtin_bundle_verify_usage, options, &bundle_file); /* bundle internals use argv[1] as further parameters */ + if (!startup_info->have_repository) { + ret = error(_("need a repository to verify a bundle")); + goto cleanup; + } + if ((bundle_fd = open_bundle(bundle_file, &header, &name)) < 0) { ret = 1; goto cleanup; From ab274909d440a43cc03f9f815e9f8d4bba705f1e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:29 +0200 Subject: [PATCH 10/13] builtin/diff: explicitly set hash algo when there is no repo The git-diff(1) command can be used outside repositories to diff two files with each other. But even if there is no repository we will end up hashing the files that we are diffing so that we can print the "index" line: ``` diff --git a/a b/b index 7898192..6178079 100644 --- a/a +++ b/b @@ -1 +1 @@ -a +b ``` We implicitly use SHA1 to calculate the hash here, which is because `the_repository` gets initialized with SHA1 during the startup routine. We are about to stop doing this though such that `the_repository` only ever has a hash function when it was properly initialized via a repo's configuration. To give full control to our users, we would ideally add a new switch to git-diff(1) that allows them to specify the hash function when executed outside of a repository. But for now, we only convert the code to make this explicit such that we can stop setting the default hash algorithm for `the_repository`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/diff.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/builtin/diff.c b/builtin/diff.c index efc37483b3..9b6cdabe15 100644 --- a/builtin/diff.c +++ b/builtin/diff.c @@ -465,6 +465,15 @@ int cmd_diff(int argc, const char **argv, const char *prefix) no_index = DIFF_NO_INDEX_IMPLICIT; } + /* + * When operating outside of a Git repository we need to have a hash + * algorithm at hand so that we can generate the blob hashes. We + * default to SHA1 here, but may eventually want to change this to be + * configurable via a command line option. + */ + if (nongit) + repo_set_hash_algo(the_repository, GIT_HASH_SHA1); + init_diff_ui_defaults(); git_config(git_diff_ui_config, NULL); prefix = precompose_argv_prefix(argc, argv, prefix); From 373bfa6077e76d0b4e8078bc216114dffe3ba430 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:35 +0200 Subject: [PATCH 11/13] builtin/shortlog: don't set up revisions without repo It is possible to run git-shortlog(1) outside of a repository by passing it output from git-log(1) via standard input. Obviously, as there is no repository in that context, it is thus unsupported to pass any revisions as arguments. Regardless of that we still end up calling `setup_revisions()`. While that works alright, it is somewhat strange. Furthermore, this is about to cause problems when we unset the default object hash. Refactor the code to only call `setup_revisions()` when we have a repository. This is safe to do as we already verify that there are no arguments when running outside of a repository anyway. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/shortlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/shortlog.c b/builtin/shortlog.c index 3c7cd2d6ef..d4daf31e22 100644 --- a/builtin/shortlog.c +++ b/builtin/shortlog.c @@ -435,7 +435,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix) usage_with_options(shortlog_usage, options); } - if (setup_revisions(argc, argv, &rev, NULL) != 1) { + if (!nongit && setup_revisions(argc, argv, &rev, NULL) != 1) { error(_("unrecognized argument: %s"), argv[1]); usage_with_options(shortlog_usage, options); } From 781ba69d8bc1b0733eb502a2c247d5aebc2ef66d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:40 +0200 Subject: [PATCH 12/13] oss-fuzz/commit-graph: set up hash algorithm Our fuzzing setups don't work in a proper repository, but only use the in-memory configured `the_repository`. Consequently, we never go through the full repository setup procedures and thus do not set up the hash algo used by the repository. The commit-graph fuzzer does rely on a properly initialized hash algo though. Initialize it explicitly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- oss-fuzz/fuzz-commit-graph.c | 1 + 1 file changed, 1 insertion(+) diff --git a/oss-fuzz/fuzz-commit-graph.c b/oss-fuzz/fuzz-commit-graph.c index fe15e2c225..75e668a057 100644 --- a/oss-fuzz/fuzz-commit-graph.c +++ b/oss-fuzz/fuzz-commit-graph.c @@ -19,6 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) * touching the disk to keep the individual fuzz-test cases as fast as * possible. */ + repo_set_hash_algo(the_repository, GIT_HASH_SHA1); the_repository->settings.commit_graph_generation_version = 2; the_repository->settings.commit_graph_read_changed_paths = 1; g = parse_commit_graph(&the_repository->settings, (void *)data, size); From c8aed5e8dadf913e041cde72d704aa91f378b71b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 7 May 2024 06:53:44 +0200 Subject: [PATCH 13/13] repository: stop setting SHA1 as the default object hash During the startup of Git, we call `initialize_the_repository()` to set up `the_repository` as well as `the_index`. Part of this setup is also to set the default object hash of the repository to SHA1. This has the effect that `the_hash_algo` is getting initialized to SHA1, as well. This default hash algorithm eventually gets overridden by most Git commands via `setup_git_directory()`, which also detects the actual hash algorithm used by the repository. There are some commands though that don't access a repository at all, or at a later point only, and thus retain the default hash function for some amount of time. As some of the the preceding commits demonstrate, this can lead to subtle issues when we access `the_hash_algo` when no repository has been set up. Address this issue by dropping the set up of the default hash algorithm completely. The effect of this is that `the_hash_algo` will map to a `NULL` pointer and thus cause Git to crash when something tries to access the hash algorithm without it being properly initialized. It thus forces all Git commands to explicitly set up the hash algorithm in case there is no repository. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- repository.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/repository.c b/repository.c index 2118f563e3..15c10015b0 100644 --- a/repository.c +++ b/repository.c @@ -26,26 +26,6 @@ void initialize_repository(struct repository *repo) repo->parsed_objects = parsed_object_pool_new(); ALLOC_ARRAY(repo->index, 1); index_state_init(repo->index, repo); - - /* - * Unfortunately, we need to keep this hack around for the time being: - * - * - Not setting up the hash algorithm for `the_repository` leads to - * crashes because `the_hash_algo` is a macro that expands to - * `the_repository->hash_algo`. So if Git commands try to access - * `the_hash_algo` without a Git directory we crash. - * - * - Setting up the hash algorithm to be SHA1 by default breaks other - * commands when running with SHA256. - * - * This is another point in case why having global state is a bad idea. - * Eventually, we should remove this hack and stop setting the hash - * algorithm in this function altogether. Instead, it should only ever - * be set via our repository setup procedures. But that requires more - * work. - */ - if (repo == the_repository) - repo_set_hash_algo(repo, GIT_HASH_SHA1); } static void expand_base_dir(char **out, const char *in,