From 4e2443b1813dded87c9cc1138f22af73748022b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 28 Jun 2019 01:39:04 +0200 Subject: [PATCH 01/18] log tests: test regex backends in "--encode=" tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve the tests added in 04deccda11 ("log: re-encode commit messages before grepping", 2013-02-11) to test the regex backends. Those tests never worked as advertised, due to the is_fixed() optimization in grep.c (which was in place at the time), and the needle in the tests being a fixed string. We'd thus always use the "fixed" backend during the tests, which would use the kwset() backend. This backend liberally accepts any garbage input, so invalid encodings would be silently accepted. In a follow-up commit we'll fix this bug, this test just demonstrates the existing issue. In practice this issue happened on Windows, see [1], but due to the structure of the existing tests & how liberal the kwset code is about garbage we missed this. Cover this blind spot by testing all our regex engines. The PCRE backend will spot these invalid encodings. It's possible that this test breaks the "basic" and "extended" backends on some systems that are more anal than glibc about the encoding of locale issues with POSIX functions that I can remember, but PCRE is more careful about the validation. 1. https://public-inbox.org/git/nycvar.QRO.7.76.6.1906271113090.44@tvgsbejvaqbjf.bet/ Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4210-log-i18n.sh | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/t/t4210-log-i18n.sh b/t/t4210-log-i18n.sh index 7c519436ef..86d22c1d4c 100755 --- a/t/t4210-log-i18n.sh +++ b/t/t4210-log-i18n.sh @@ -1,12 +1,15 @@ #!/bin/sh test_description='test log with i18n features' -. ./test-lib.sh +. ./lib-gettext.sh # two forms of é utf8_e=$(printf '\303\251') latin1_e=$(printf '\351') +# invalid UTF-8 +invalid_e=$(printf '\303\50)') # ")" at end to close opening "(" + test_expect_success 'create commits in different encodings' ' test_tick && cat >msg <<-EOF && @@ -53,4 +56,40 @@ test_expect_success 'log --grep does not find non-reencoded values (latin1)' ' test_must_be_empty actual ' +for engine in fixed basic extended perl +do + prereq= + result=success + if test $engine = "perl" + then + result=failure + prereq="PCRE" + else + prereq="" + fi + force_regex= + if test $engine != "fixed" + then + force_regex=.* + fi + test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " + cat >expect <<-\EOF && + latin1 + utf8 + EOF + LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$latin1_e\" >actual && + test_cmp expect actual + " + + test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " + LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$utf8_e\" >actual && + test_must_be_empty actual + " + + test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" " + LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual && + test_must_be_empty actual + " +done + test_done From 44570188a0e324048decf06b845d34c45b08a4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 28 Jun 2019 01:39:05 +0200 Subject: [PATCH 02/18] grep: don't use PCRE2?_UTF8 with "log --encoding=" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a bug introduced in 18547aacf5 ("grep/pcre: support utf-8", 2016-06-25) that was missed due to a blindspot in our tests, as discussed in the previous commit. I then blindly copied the same bug in 94da9193a6 ("grep: add support for PCRE v2", 2017-06-01) when adding the PCRE v2 code. We should not tell PCRE that we're processing UTF-8 just because we're dealing with non-ASCII. In the case of e.g. "log --encoding=<...>" under is_utf8_locale() the haystack might be in ISO-8859-1, and the needle might be in a non-UTF-8 encoding. Maybe we should be more strict here and die earlier? Should we also be converting the needle to the encoding in question, and failing if it's not a string that's valid in that encoding? Maybe. But for now matching this as non-UTF8 at least has some hope of producing sensible results, since we know that our default heuristic of assuming the text to be matched is in the user locale encoding isn't true when we've explicitly encoded it to be in a different encoding. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 8 ++++---- grep.h | 1 + revision.c | 3 +++ t/t4210-log-i18n.sh | 6 ++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/grep.c b/grep.c index f7c3a5803e..1de4ab49c0 100644 --- a/grep.c +++ b/grep.c @@ -388,11 +388,11 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) int options = PCRE_MULTILINE; if (opt->ignore_case) { - if (has_non_ascii(p->pattern)) + if (!opt->ignore_locale && has_non_ascii(p->pattern)) p->pcre1_tables = pcre_maketables(); options |= PCRE_CASELESS; } - if (is_utf8_locale() && has_non_ascii(p->pattern)) + if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern)) options |= PCRE_UTF8; p->pcre1_regexp = pcre_compile(p->pattern, options, &error, &erroffset, @@ -498,14 +498,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt p->pcre2_compile_context = NULL; if (opt->ignore_case) { - if (has_non_ascii(p->pattern)) { + if (!opt->ignore_locale && has_non_ascii(p->pattern)) { character_tables = pcre2_maketables(NULL); p->pcre2_compile_context = pcre2_compile_context_create(NULL); pcre2_set_character_tables(p->pcre2_compile_context, character_tables); } options |= PCRE2_CASELESS; } - if (is_utf8_locale() && has_non_ascii(p->pattern)) + if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern)) options |= PCRE2_UTF; p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern, diff --git a/grep.h b/grep.h index 1875880f37..4bb8a79d93 100644 --- a/grep.h +++ b/grep.h @@ -173,6 +173,7 @@ struct grep_opt { int funcbody; int extended_regexp_option; int pattern_type_option; + int ignore_locale; char colors[NR_GREP_COLORS][COLOR_MAXLEN]; unsigned pre_context; unsigned post_context; diff --git a/revision.c b/revision.c index 621feb9df7..a842fb158a 100644 --- a/revision.c +++ b/revision.c @@ -28,6 +28,7 @@ #include "commit-graph.h" #include "prio-queue.h" #include "hashmap.h" +#include "utf8.h" volatile show_early_output_fn_t show_early_output; @@ -2655,6 +2656,8 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s grep_commit_pattern_type(GREP_PATTERN_TYPE_UNSPECIFIED, &revs->grep_filter); + if (!is_encoding_utf8(get_log_output_encoding())) + revs->grep_filter.ignore_locale = 1; compile_grep_patterns(&revs->grep_filter); if (revs->reverse && revs->reflog_info) diff --git a/t/t4210-log-i18n.sh b/t/t4210-log-i18n.sh index 86d22c1d4c..515bcb7ce1 100755 --- a/t/t4210-log-i18n.sh +++ b/t/t4210-log-i18n.sh @@ -59,10 +59,8 @@ test_expect_success 'log --grep does not find non-reencoded values (latin1)' ' for engine in fixed basic extended perl do prereq= - result=success if test $engine = "perl" then - result=failure prereq="PCRE" else prereq="" @@ -72,7 +70,7 @@ do then force_regex=.* fi - test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " + test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " cat >expect <<-\EOF && latin1 utf8 @@ -86,7 +84,7 @@ do test_must_be_empty actual " - test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" " + test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" " LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual && test_must_be_empty actual " From b14cf112e2c3d86de931276c2c778004a168db65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:53 +0200 Subject: [PATCH 03/18] t4210: skip more command-line encoding tests on MinGW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In 5212f91deb ("t4210: skip command-line encoding tests on mingw", 2014-07-17) the positive tests in this file were skipped. That left the negative tests that don't produce a match. An upcoming change to migrate the "fixed" backend of grep to PCRE v2 will cause these "log" commands to produce an error instead on MinGW. This is because the command-line on that platform implicitly has its encoding changed before being passed to git. See [1]. 1. https://public-inbox.org/git/nycvar.QRO.7.76.6.1907011515150.44@tvgsbejvaqbjf.bet/ Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t4210-log-i18n.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/t/t4210-log-i18n.sh b/t/t4210-log-i18n.sh index 515bcb7ce1..6e61f57f09 100755 --- a/t/t4210-log-i18n.sh +++ b/t/t4210-log-i18n.sh @@ -51,7 +51,7 @@ test_expect_success !MINGW 'log --grep does not find non-reencoded values (utf8) test_must_be_empty actual ' -test_expect_success 'log --grep does not find non-reencoded values (latin1)' ' +test_expect_success !MINGW 'log --grep does not find non-reencoded values (latin1)' ' git log --encoding=ISO-8859-1 --format=%s --grep=$utf8_e >actual && test_must_be_empty actual ' @@ -70,7 +70,7 @@ do then force_regex=.* fi - test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " + test_expect_success !MINGW,GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " cat >expect <<-\EOF && latin1 utf8 @@ -79,12 +79,12 @@ do test_cmp expect actual " - test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " + test_expect_success !MINGW,GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" " LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$utf8_e\" >actual && test_must_be_empty actual " - test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" " + test_expect_success !MINGW,GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" " LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual && test_must_be_empty actual " From f463beb805638830e2a6b16359f94d8afca289ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:54 +0200 Subject: [PATCH 04/18] grep: inline the return value of a function call used only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since e944d9d932 ("grep: rewrite an if/else condition to avoid duplicate expression", 2016-06-25) the "ascii_only" variable has only been used once in compile_regexp(), let's just inline it there. This makes the code easier to read, and might make it marginally faster depending on compiler optimizations. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/grep.c b/grep.c index 1de4ab49c0..4e8d0645a8 100644 --- a/grep.c +++ b/grep.c @@ -650,13 +650,11 @@ static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { - int ascii_only; int err; int regflags = REG_NEWLINE; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; - ascii_only = !has_non_ascii(p->pattern); /* * Even when -F (fixed) asks us to do a non-regexp search, we @@ -673,7 +671,7 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) if (opt->fixed || has_null(p->pattern, p->patternlen) || is_fixed(p->pattern, p->patternlen)) - p->fixed = !p->ignore_case || ascii_only; + p->fixed = !p->ignore_case || !has_non_ascii(p->pattern); if (p->fixed) { p->kws = kwsalloc(p->ignore_case ? tolower_trans_tbl : NULL); From 471dac5d2ceec4ccf7ad149402dfcb66ac066ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:55 +0200 Subject: [PATCH 05/18] grep tests: move "grep binary" alongside the rest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the "grep binary" test case added in aca20dd558 ("grep: add test script for binary file handling", 2010-05-22) so that it lives alongside the rest of the "grep" tests in t781*. This would have left a gap in the t/700* namespace, so move a "filter-branch" test down, leaving the "t7010-setup.sh" test as the next one after that. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- ...ilter-branch-null-sha1.sh => t7008-filter-branch-null-sha1.sh} | 0 t/{t7008-grep-binary.sh => t7815-grep-binary.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename t/{t7009-filter-branch-null-sha1.sh => t7008-filter-branch-null-sha1.sh} (100%) rename t/{t7008-grep-binary.sh => t7815-grep-binary.sh} (100%) diff --git a/t/t7009-filter-branch-null-sha1.sh b/t/t7008-filter-branch-null-sha1.sh similarity index 100% rename from t/t7009-filter-branch-null-sha1.sh rename to t/t7008-filter-branch-null-sha1.sh diff --git a/t/t7008-grep-binary.sh b/t/t7815-grep-binary.sh similarity index 100% rename from t/t7008-grep-binary.sh rename to t/t7815-grep-binary.sh From d316af059d67273f381048cdf538d2e0667b6485 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:56 +0200 Subject: [PATCH 06/18] grep tests: move binary pattern tests into their own file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the tests for "-f " where "" contains a NUL byte pattern into their own file. I added most of these tests in 966be95549 ("grep: add tests to fix blind spots with \0 patterns", 2017-05-20). Whether a regex engine supports matching binary content is very different from whether it matches binary patterns. Since 2f8952250a ("regex: add regexec_buf() that can work on a non NUL-terminated string", 2016-09-21) we've required REG_STARTEND of our regex engines so we can match binary content, but only the PCRE v2 engine can sensibly match binary patterns. Since 9eceddeec6 ("Use kwset in grep", 2011-08-21) we've been punting patterns containing NUL-byte and considering them fixed, except in cases where "--ignore-case" is provided and they're non-ASCII, see 5c1ebcca4d ("grep/icase: avoid kwsset on literal non-ascii strings", 2016-06-25). Subsequent commits will change this behavior. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7815-grep-binary.sh | 101 ----------------------------- t/t7816-grep-binary-pattern.sh | 114 +++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 101 deletions(-) create mode 100755 t/t7816-grep-binary-pattern.sh diff --git a/t/t7815-grep-binary.sh b/t/t7815-grep-binary.sh index 2d87c49b75..90ebb64f46 100755 --- a/t/t7815-grep-binary.sh +++ b/t/t7815-grep-binary.sh @@ -4,41 +4,6 @@ test_description='git grep in binary files' . ./test-lib.sh -nul_match () { - matches=$1 - flags=$2 - pattern=$3 - pattern_human=$(echo "$pattern" | sed 's/Q//g') - - if test "$matches" = 1 - then - test_expect_success "git grep -f f $flags '$pattern_human' a" " - printf '$pattern' | q_to_nul >f && - git grep -f f $flags a - " - elif test "$matches" = 0 - then - test_expect_success "git grep -f f $flags '$pattern_human' a" " - printf '$pattern' | q_to_nul >f && - test_must_fail git grep -f f $flags a - " - elif test "$matches" = T1 - then - test_expect_failure "git grep -f f $flags '$pattern_human' a" " - printf '$pattern' | q_to_nul >f && - git grep -f f $flags a - " - elif test "$matches" = T0 - then - test_expect_failure "git grep -f f $flags '$pattern_human' a" " - printf '$pattern' | q_to_nul >f && - test_must_fail git grep -f f $flags a - " - else - test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false' - fi -} - test_expect_success 'setup' " echo 'binaryQfileQm[*]cQ*æQð' | q_to_nul >a && git add a && @@ -102,72 +67,6 @@ test_expect_failure 'git grep .fi a' ' git grep .fi a ' -nul_match 1 '-F' 'yQf' -nul_match 0 '-F' 'yQx' -nul_match 1 '-Fi' 'YQf' -nul_match 0 '-Fi' 'YQx' -nul_match 1 '' 'yQf' -nul_match 0 '' 'yQx' -nul_match 1 '' 'æQð' -nul_match 1 '-F' 'eQm[*]c' -nul_match 1 '-Fi' 'EQM[*]C' - -# Regex patterns that would match but shouldn't with -F -nul_match 0 '-F' 'yQ[f]' -nul_match 0 '-F' '[y]Qf' -nul_match 0 '-Fi' 'YQ[F]' -nul_match 0 '-Fi' '[Y]QF' -nul_match 0 '-F' 'æQ[ð]' -nul_match 0 '-F' '[æ]Qð' -nul_match 0 '-Fi' 'ÆQ[Ð]' -nul_match 0 '-Fi' '[Æ]QÐ' - -# kwset is disabled on -i & non-ASCII. No way to match non-ASCII \0 -# patterns case-insensitively. -nul_match T1 '-i' 'ÆQÐ' - -# \0 implicitly disables regexes. This is an undocumented internal -# limitation. -nul_match T1 '' 'yQ[f]' -nul_match T1 '' '[y]Qf' -nul_match T1 '-i' 'YQ[F]' -nul_match T1 '-i' '[Y]Qf' -nul_match T1 '' 'æQ[ð]' -nul_match T1 '' '[æ]Qð' -nul_match T1 '-i' 'ÆQ[Ð]' - -# ... because of \0 implicitly disabling regexes regexes that -# should/shouldn't match don't do the right thing. -nul_match T1 '' 'eQm.*cQ' -nul_match T1 '-i' 'EQM.*cQ' -nul_match T0 '' 'eQm[*]c' -nul_match T0 '-i' 'EQM[*]C' - -# Due to the REG_STARTEND extension when kwset() is disabled on -i & -# non-ASCII the string will be matched in its entirety, but the -# pattern will be cut off at the first \0. -nul_match 0 '-i' 'NOMATCHQð' -nul_match T0 '-i' '[Æ]QNOMATCH' -nul_match T0 '-i' '[æ]QNOMATCH' -# Matches, but for the wrong reasons, just stops at [æ] -nul_match 1 '-i' '[Æ]Qð' -nul_match 1 '-i' '[æ]Qð' - -# Ensure that the matcher doesn't regress to something that stops at -# \0 -nul_match 0 '-F' 'yQ[f]' -nul_match 0 '-Fi' 'YQ[F]' -nul_match 0 '' 'yQNOMATCH' -nul_match 0 '' 'QNOMATCH' -nul_match 0 '-i' 'YQNOMATCH' -nul_match 0 '-i' 'QNOMATCH' -nul_match 0 '-F' 'æQ[ð]' -nul_match 0 '-Fi' 'ÆQ[Ð]' -nul_match 0 '' 'yQNÓMATCH' -nul_match 0 '' 'QNÓMATCH' -nul_match 0 '-i' 'YQNÓMATCH' -nul_match 0 '-i' 'QNÓMATCH' - test_expect_success 'grep respects binary diff attribute' ' echo text >t && git add t && diff --git a/t/t7816-grep-binary-pattern.sh b/t/t7816-grep-binary-pattern.sh new file mode 100755 index 0000000000..4060dbd679 --- /dev/null +++ b/t/t7816-grep-binary-pattern.sh @@ -0,0 +1,114 @@ +#!/bin/sh + +test_description='git grep with a binary pattern files' + +. ./test-lib.sh + +nul_match () { + matches=$1 + flags=$2 + pattern=$3 + pattern_human=$(echo "$pattern" | sed 's/Q//g') + + if test "$matches" = 1 + then + test_expect_success "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + git grep -f f $flags a + " + elif test "$matches" = 0 + then + test_expect_success "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + test_must_fail git grep -f f $flags a + " + elif test "$matches" = T1 + then + test_expect_failure "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + git grep -f f $flags a + " + elif test "$matches" = T0 + then + test_expect_failure "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + test_must_fail git grep -f f $flags a + " + else + test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false' + fi +} + +test_expect_success 'setup' " + echo 'binaryQfileQm[*]cQ*æQð' | q_to_nul >a && + git add a && + git commit -m. +" + +nul_match 1 '-F' 'yQf' +nul_match 0 '-F' 'yQx' +nul_match 1 '-Fi' 'YQf' +nul_match 0 '-Fi' 'YQx' +nul_match 1 '' 'yQf' +nul_match 0 '' 'yQx' +nul_match 1 '' 'æQð' +nul_match 1 '-F' 'eQm[*]c' +nul_match 1 '-Fi' 'EQM[*]C' + +# Regex patterns that would match but shouldn't with -F +nul_match 0 '-F' 'yQ[f]' +nul_match 0 '-F' '[y]Qf' +nul_match 0 '-Fi' 'YQ[F]' +nul_match 0 '-Fi' '[Y]QF' +nul_match 0 '-F' 'æQ[ð]' +nul_match 0 '-F' '[æ]Qð' +nul_match 0 '-Fi' 'ÆQ[Ð]' +nul_match 0 '-Fi' '[Æ]QÐ' + +# kwset is disabled on -i & non-ASCII. No way to match non-ASCII \0 +# patterns case-insensitively. +nul_match T1 '-i' 'ÆQÐ' + +# \0 implicitly disables regexes. This is an undocumented internal +# limitation. +nul_match T1 '' 'yQ[f]' +nul_match T1 '' '[y]Qf' +nul_match T1 '-i' 'YQ[F]' +nul_match T1 '-i' '[Y]Qf' +nul_match T1 '' 'æQ[ð]' +nul_match T1 '' '[æ]Qð' +nul_match T1 '-i' 'ÆQ[Ð]' + +# ... because of \0 implicitly disabling regexes regexes that +# should/shouldn't match don't do the right thing. +nul_match T1 '' 'eQm.*cQ' +nul_match T1 '-i' 'EQM.*cQ' +nul_match T0 '' 'eQm[*]c' +nul_match T0 '-i' 'EQM[*]C' + +# Due to the REG_STARTEND extension when kwset() is disabled on -i & +# non-ASCII the string will be matched in its entirety, but the +# pattern will be cut off at the first \0. +nul_match 0 '-i' 'NOMATCHQð' +nul_match T0 '-i' '[Æ]QNOMATCH' +nul_match T0 '-i' '[æ]QNOMATCH' +# Matches, but for the wrong reasons, just stops at [æ] +nul_match 1 '-i' '[Æ]Qð' +nul_match 1 '-i' '[æ]Qð' + +# Ensure that the matcher doesn't regress to something that stops at +# \0 +nul_match 0 '-F' 'yQ[f]' +nul_match 0 '-Fi' 'YQ[F]' +nul_match 0 '' 'yQNOMATCH' +nul_match 0 '' 'QNOMATCH' +nul_match 0 '-i' 'YQNOMATCH' +nul_match 0 '-i' 'QNOMATCH' +nul_match 0 '-F' 'æQ[ð]' +nul_match 0 '-Fi' 'ÆQ[Ð]' +nul_match 0 '' 'yQNÓMATCH' +nul_match 0 '' 'QNÓMATCH' +nul_match 0 '-i' 'YQNÓMATCH' +nul_match 0 '-i' 'QNÓMATCH' + +test_done From 25754125cef278c7e9492fbd6dc4a28319b01f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:57 +0200 Subject: [PATCH 07/18] grep: make the behavior for NUL-byte in patterns sane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The behavior of "grep" when patterns contained a NUL-byte has always been haphazard, and has served the vagaries of the implementation more than anything else. A pattern containing a NUL-byte can only be provided via "-f ". Since pickaxe (log search) has no such flag the NUL-byte in patterns has only ever been supported by "grep" (and not "log --grep"). Since 9eceddeec6 ("Use kwset in grep", 2011-08-21) patterns containing "\0" were considered fixed. In 966be95549 ("grep: add tests to fix blind spots with \0 patterns", 2017-05-20) I added tests for this behavior. Change the behavior to do the obvious thing, i.e. don't silently discard a regex pattern and make it implicitly fixed just because they contain a NUL-byte. Instead die if the backend in question can't handle them, e.g. --basic-regexp is combined with such a pattern. This is desired because from a user's point of view it's the obvious thing to do. Whether we support BRE/ERE/Perl syntax is different from whether our implementation is limited by C-strings. These patterns are obscure enough that I think this behavior change is OK, especially since we never documented the old behavior. Doing this also makes it easier to replace the kwset backend with something else, since we'll no longer strictly need it for anything we can't easily use another fixed-string backend for. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- Documentation/git-grep.txt | 17 ++++ grep.c | 23 ++--- t/t7816-grep-binary-pattern.sh | 159 ++++++++++++++++++--------------- 3 files changed, 110 insertions(+), 89 deletions(-) diff --git a/Documentation/git-grep.txt b/Documentation/git-grep.txt index 2d27969057..c89fb569e3 100644 --- a/Documentation/git-grep.txt +++ b/Documentation/git-grep.txt @@ -271,6 +271,23 @@ providing this option will cause it to die. -f :: Read patterns from , one per line. ++ +Passing the pattern via allows for providing a search pattern +containing a \0. ++ +Not all pattern types support patterns containing \0. Git will error +out if a given pattern type can't support such a pattern. The +`--perl-regexp` pattern type when compiled against the PCRE v2 backend +has the widest support for these types of patterns. ++ +In versions of Git before 2.23.0 patterns containing \0 would be +silently considered fixed. This was never documented, there were also +odd and undocumented interactions between e.g. non-ASCII patterns +containing \0 and `--ignore-case`. ++ +In future versions we may learn to support patterns containing \0 for +more search backends, until then we'll die when the pattern type in +question doesn't support them. -e:: The next parameter is the pattern. This option has to be diff --git a/grep.c b/grep.c index 4e8d0645a8..d6603bc950 100644 --- a/grep.c +++ b/grep.c @@ -368,18 +368,6 @@ static int is_fixed(const char *s, size_t len) return 1; } -static int has_null(const char *s, size_t len) -{ - /* - * regcomp cannot accept patterns with NULs so when using it - * we consider any pattern containing a NUL fixed. - */ - if (memchr(s, 0, len)) - return 1; - - return 0; -} - #ifdef USE_LIBPCRE1 static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) { @@ -668,9 +656,7 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) * simple string match using kws. p->fixed tells us if we * want to use kws. */ - if (opt->fixed || - has_null(p->pattern, p->patternlen) || - is_fixed(p->pattern, p->patternlen)) + if (opt->fixed || is_fixed(p->pattern, p->patternlen)) p->fixed = !p->ignore_case || !has_non_ascii(p->pattern); if (p->fixed) { @@ -678,7 +664,12 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; - } else if (opt->fixed) { + } + + if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) + die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); + + if (opt->fixed) { /* * We come here when the pattern has the non-ascii * characters we cannot case-fold, and asked to diff --git a/t/t7816-grep-binary-pattern.sh b/t/t7816-grep-binary-pattern.sh index 4060dbd679..9e09bd5d6a 100755 --- a/t/t7816-grep-binary-pattern.sh +++ b/t/t7816-grep-binary-pattern.sh @@ -2,113 +2,126 @@ test_description='git grep with a binary pattern files' -. ./test-lib.sh +. ./lib-gettext.sh -nul_match () { +nul_match_internal () { matches=$1 - flags=$2 - pattern=$3 + prereqs=$2 + lc_all=$3 + extra_flags=$4 + flags=$5 + pattern=$6 pattern_human=$(echo "$pattern" | sed 's/Q//g') if test "$matches" = 1 then - test_expect_success "git grep -f f $flags '$pattern_human' a" " + test_expect_success $prereqs "LC_ALL='$lc_all' git grep $extra_flags -f f $flags '$pattern_human' a" " printf '$pattern' | q_to_nul >f && - git grep -f f $flags a + LC_ALL='$lc_all' git grep $extra_flags -f f $flags a " elif test "$matches" = 0 then - test_expect_success "git grep -f f $flags '$pattern_human' a" " + test_expect_success $prereqs "LC_ALL='$lc_all' git grep $extra_flags -f f $flags '$pattern_human' a" " + >stderr && printf '$pattern' | q_to_nul >f && - test_must_fail git grep -f f $flags a + test_must_fail env LC_ALL=\"$lc_all\" git grep $extra_flags -f f $flags a 2>stderr && + test_i18ngrep ! 'This is only supported with -P under PCRE v2' stderr " - elif test "$matches" = T1 + elif test "$matches" = P then - test_expect_failure "git grep -f f $flags '$pattern_human' a" " + test_expect_success $prereqs "error, PCRE v2 only: LC_ALL='$lc_all' git grep -f f $flags '$pattern_human' a" " + >stderr && printf '$pattern' | q_to_nul >f && - git grep -f f $flags a - " - elif test "$matches" = T0 - then - test_expect_failure "git grep -f f $flags '$pattern_human' a" " - printf '$pattern' | q_to_nul >f && - test_must_fail git grep -f f $flags a + test_must_fail env LC_ALL=\"$lc_all\" git grep -f f $flags a 2>stderr && + test_i18ngrep 'This is only supported with -P under PCRE v2' stderr " else test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false' fi } +nul_match () { + matches=$1 + matches_pcre2=$2 + matches_pcre2_locale=$3 + flags=$4 + pattern=$5 + pattern_human=$(echo "$pattern" | sed 's/Q//g') + + nul_match_internal "$matches" "" "C" "" "$flags" "$pattern" + nul_match_internal "$matches_pcre2" "LIBPCRE2" "C" "-P" "$flags" "$pattern" + nul_match_internal "$matches_pcre2_locale" "LIBPCRE2,GETTEXT_LOCALE" "$is_IS_locale" "-P" "$flags" "$pattern" +} + test_expect_success 'setup' " echo 'binaryQfileQm[*]cQ*æQð' | q_to_nul >a && git add a && git commit -m. " -nul_match 1 '-F' 'yQf' -nul_match 0 '-F' 'yQx' -nul_match 1 '-Fi' 'YQf' -nul_match 0 '-Fi' 'YQx' -nul_match 1 '' 'yQf' -nul_match 0 '' 'yQx' -nul_match 1 '' 'æQð' -nul_match 1 '-F' 'eQm[*]c' -nul_match 1 '-Fi' 'EQM[*]C' +# Simple fixed-string matching that can use kwset (no -i && non-ASCII) +nul_match 1 1 1 '-F' 'yQf' +nul_match 0 0 0 '-F' 'yQx' +nul_match 1 1 1 '-Fi' 'YQf' +nul_match 0 0 0 '-Fi' 'YQx' +nul_match 1 1 1 '' 'yQf' +nul_match 0 0 0 '' 'yQx' +nul_match 1 1 1 '' 'æQð' +nul_match 1 1 1 '-F' 'eQm[*]c' +nul_match 1 1 1 '-Fi' 'EQM[*]C' # Regex patterns that would match but shouldn't with -F -nul_match 0 '-F' 'yQ[f]' -nul_match 0 '-F' '[y]Qf' -nul_match 0 '-Fi' 'YQ[F]' -nul_match 0 '-Fi' '[Y]QF' -nul_match 0 '-F' 'æQ[ð]' -nul_match 0 '-F' '[æ]Qð' -nul_match 0 '-Fi' 'ÆQ[Ð]' -nul_match 0 '-Fi' '[Æ]QÐ' +nul_match 0 0 0 '-F' 'yQ[f]' +nul_match 0 0 0 '-F' '[y]Qf' +nul_match 0 0 0 '-Fi' 'YQ[F]' +nul_match 0 0 0 '-Fi' '[Y]QF' +nul_match 0 0 0 '-F' 'æQ[ð]' +nul_match 0 0 0 '-F' '[æ]Qð' -# kwset is disabled on -i & non-ASCII. No way to match non-ASCII \0 -# patterns case-insensitively. -nul_match T1 '-i' 'ÆQÐ' +# The -F kwset codepath can't handle -i && non-ASCII... +nul_match P 1 1 '-i' '[æ]Qð' -# \0 implicitly disables regexes. This is an undocumented internal -# limitation. -nul_match T1 '' 'yQ[f]' -nul_match T1 '' '[y]Qf' -nul_match T1 '-i' 'YQ[F]' -nul_match T1 '-i' '[Y]Qf' -nul_match T1 '' 'æQ[ð]' -nul_match T1 '' '[æ]Qð' -nul_match T1 '-i' 'ÆQ[Ð]' +# ...PCRE v2 only matches non-ASCII with -i casefolding under UTF-8 +# semantics +nul_match P P P '-Fi' 'ÆQ[Ð]' +nul_match P 0 1 '-i' 'ÆQ[Ð]' +nul_match P 0 1 '-i' '[Æ]QÐ' +nul_match P 0 1 '-i' '[Æ]Qð' +nul_match P 0 1 '-i' 'ÆQÐ' -# ... because of \0 implicitly disabling regexes regexes that -# should/shouldn't match don't do the right thing. -nul_match T1 '' 'eQm.*cQ' -nul_match T1 '-i' 'EQM.*cQ' -nul_match T0 '' 'eQm[*]c' -nul_match T0 '-i' 'EQM[*]C' +# \0 in regexes can only work with -P & PCRE v2 +nul_match P 1 1 '' 'yQ[f]' +nul_match P 1 1 '' '[y]Qf' +nul_match P 1 1 '-i' 'YQ[F]' +nul_match P 1 1 '-i' '[Y]Qf' +nul_match P 1 1 '' 'æQ[ð]' +nul_match P 1 1 '' '[æ]Qð' +nul_match P 0 1 '-i' 'ÆQ[Ð]' +nul_match P 1 1 '' 'eQm.*cQ' +nul_match P 1 1 '-i' 'EQM.*cQ' +nul_match P 0 0 '' 'eQm[*]c' +nul_match P 0 0 '-i' 'EQM[*]C' -# Due to the REG_STARTEND extension when kwset() is disabled on -i & -# non-ASCII the string will be matched in its entirety, but the -# pattern will be cut off at the first \0. -nul_match 0 '-i' 'NOMATCHQð' -nul_match T0 '-i' '[Æ]QNOMATCH' -nul_match T0 '-i' '[æ]QNOMATCH' -# Matches, but for the wrong reasons, just stops at [æ] -nul_match 1 '-i' '[Æ]Qð' -nul_match 1 '-i' '[æ]Qð' +# Assert that we're using REG_STARTEND and the pattern doesn't match +# just because it's cut off at the first \0. +nul_match 0 0 0 '-i' 'NOMATCHQð' +nul_match P 0 0 '-i' '[Æ]QNOMATCH' +nul_match P 0 0 '-i' '[æ]QNOMATCH' # Ensure that the matcher doesn't regress to something that stops at # \0 -nul_match 0 '-F' 'yQ[f]' -nul_match 0 '-Fi' 'YQ[F]' -nul_match 0 '' 'yQNOMATCH' -nul_match 0 '' 'QNOMATCH' -nul_match 0 '-i' 'YQNOMATCH' -nul_match 0 '-i' 'QNOMATCH' -nul_match 0 '-F' 'æQ[ð]' -nul_match 0 '-Fi' 'ÆQ[Ð]' -nul_match 0 '' 'yQNÓMATCH' -nul_match 0 '' 'QNÓMATCH' -nul_match 0 '-i' 'YQNÓMATCH' -nul_match 0 '-i' 'QNÓMATCH' +nul_match 0 0 0 '-F' 'yQ[f]' +nul_match 0 0 0 '-Fi' 'YQ[F]' +nul_match 0 0 0 '' 'yQNOMATCH' +nul_match 0 0 0 '' 'QNOMATCH' +nul_match 0 0 0 '-i' 'YQNOMATCH' +nul_match 0 0 0 '-i' 'QNOMATCH' +nul_match 0 0 0 '-F' 'æQ[ð]' +nul_match P P P '-Fi' 'ÆQ[Ð]' +nul_match P 0 1 '-i' 'ÆQ[Ð]' +nul_match 0 0 0 '' 'yQNÓMATCH' +nul_match 0 0 0 '' 'QNÓMATCH' +nul_match 0 0 0 '-i' 'YQNÓMATCH' +nul_match 0 0 0 '-i' 'QNÓMATCH' test_done From 45d1f37ccc16ad53303910e150f7fbe36213aad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:58 +0200 Subject: [PATCH 08/18] grep: drop support for \0 in --fixed-strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change "-f " to not support patterns with a NUL-byte in them under --fixed-strings. We'll now only support these under "--perl-regexp" with PCRE v2. A previous change to grep's documentation changed the description of "-f " to be vague enough as to not promise that this would work. By dropping support for this we make it a whole lot easier to move away from the kwset backend, which we'll do in a subsequent change. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 6 +-- t/t7816-grep-binary-pattern.sh | 82 +++++++++++++++++----------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/grep.c b/grep.c index d6603bc950..8d0fff316c 100644 --- a/grep.c +++ b/grep.c @@ -644,6 +644,9 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; + if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) + die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); + /* * Even when -F (fixed) asks us to do a non-regexp search, we * may not be able to correctly case-fold when -i @@ -666,9 +669,6 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) return; } - if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) - die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); - if (opt->fixed) { /* * We come here when the pattern has the non-ascii diff --git a/t/t7816-grep-binary-pattern.sh b/t/t7816-grep-binary-pattern.sh index 9e09bd5d6a..60bab291e4 100755 --- a/t/t7816-grep-binary-pattern.sh +++ b/t/t7816-grep-binary-pattern.sh @@ -60,23 +60,23 @@ test_expect_success 'setup' " " # Simple fixed-string matching that can use kwset (no -i && non-ASCII) -nul_match 1 1 1 '-F' 'yQf' -nul_match 0 0 0 '-F' 'yQx' -nul_match 1 1 1 '-Fi' 'YQf' -nul_match 0 0 0 '-Fi' 'YQx' -nul_match 1 1 1 '' 'yQf' -nul_match 0 0 0 '' 'yQx' -nul_match 1 1 1 '' 'æQð' -nul_match 1 1 1 '-F' 'eQm[*]c' -nul_match 1 1 1 '-Fi' 'EQM[*]C' +nul_match P P P '-F' 'yQf' +nul_match P P P '-F' 'yQx' +nul_match P P P '-Fi' 'YQf' +nul_match P P P '-Fi' 'YQx' +nul_match P P 1 '' 'yQf' +nul_match P P 0 '' 'yQx' +nul_match P P 1 '' 'æQð' +nul_match P P P '-F' 'eQm[*]c' +nul_match P P P '-Fi' 'EQM[*]C' # Regex patterns that would match but shouldn't with -F -nul_match 0 0 0 '-F' 'yQ[f]' -nul_match 0 0 0 '-F' '[y]Qf' -nul_match 0 0 0 '-Fi' 'YQ[F]' -nul_match 0 0 0 '-Fi' '[Y]QF' -nul_match 0 0 0 '-F' 'æQ[ð]' -nul_match 0 0 0 '-F' '[æ]Qð' +nul_match P P P '-F' 'yQ[f]' +nul_match P P P '-F' '[y]Qf' +nul_match P P P '-Fi' 'YQ[F]' +nul_match P P P '-Fi' '[Y]QF' +nul_match P P P '-F' 'æQ[ð]' +nul_match P P P '-F' '[æ]Qð' # The -F kwset codepath can't handle -i && non-ASCII... nul_match P 1 1 '-i' '[æ]Qð' @@ -90,38 +90,38 @@ nul_match P 0 1 '-i' '[Æ]Qð' nul_match P 0 1 '-i' 'ÆQÐ' # \0 in regexes can only work with -P & PCRE v2 -nul_match P 1 1 '' 'yQ[f]' -nul_match P 1 1 '' '[y]Qf' -nul_match P 1 1 '-i' 'YQ[F]' -nul_match P 1 1 '-i' '[Y]Qf' -nul_match P 1 1 '' 'æQ[ð]' -nul_match P 1 1 '' '[æ]Qð' -nul_match P 0 1 '-i' 'ÆQ[Ð]' -nul_match P 1 1 '' 'eQm.*cQ' -nul_match P 1 1 '-i' 'EQM.*cQ' -nul_match P 0 0 '' 'eQm[*]c' -nul_match P 0 0 '-i' 'EQM[*]C' +nul_match P P 1 '' 'yQ[f]' +nul_match P P 1 '' '[y]Qf' +nul_match P P 1 '-i' 'YQ[F]' +nul_match P P 1 '-i' '[Y]Qf' +nul_match P P 1 '' 'æQ[ð]' +nul_match P P 1 '' '[æ]Qð' +nul_match P P 1 '-i' 'ÆQ[Ð]' +nul_match P P 1 '' 'eQm.*cQ' +nul_match P P 1 '-i' 'EQM.*cQ' +nul_match P P 0 '' 'eQm[*]c' +nul_match P P 0 '-i' 'EQM[*]C' # Assert that we're using REG_STARTEND and the pattern doesn't match # just because it's cut off at the first \0. -nul_match 0 0 0 '-i' 'NOMATCHQð' -nul_match P 0 0 '-i' '[Æ]QNOMATCH' -nul_match P 0 0 '-i' '[æ]QNOMATCH' +nul_match P P 0 '-i' 'NOMATCHQð' +nul_match P P 0 '-i' '[Æ]QNOMATCH' +nul_match P P 0 '-i' '[æ]QNOMATCH' # Ensure that the matcher doesn't regress to something that stops at # \0 -nul_match 0 0 0 '-F' 'yQ[f]' -nul_match 0 0 0 '-Fi' 'YQ[F]' -nul_match 0 0 0 '' 'yQNOMATCH' -nul_match 0 0 0 '' 'QNOMATCH' -nul_match 0 0 0 '-i' 'YQNOMATCH' -nul_match 0 0 0 '-i' 'QNOMATCH' -nul_match 0 0 0 '-F' 'æQ[ð]' +nul_match P P P '-F' 'yQ[f]' +nul_match P P P '-Fi' 'YQ[F]' +nul_match P P 0 '' 'yQNOMATCH' +nul_match P P 0 '' 'QNOMATCH' +nul_match P P 0 '-i' 'YQNOMATCH' +nul_match P P 0 '-i' 'QNOMATCH' +nul_match P P P '-F' 'æQ[ð]' nul_match P P P '-Fi' 'ÆQ[Ð]' -nul_match P 0 1 '-i' 'ÆQ[Ð]' -nul_match 0 0 0 '' 'yQNÓMATCH' -nul_match 0 0 0 '' 'QNÓMATCH' -nul_match 0 0 0 '-i' 'YQNÓMATCH' -nul_match 0 0 0 '-i' 'QNÓMATCH' +nul_match P P 1 '-i' 'ÆQ[Ð]' +nul_match P P 0 '' 'yQNÓMATCH' +nul_match P P 0 '' 'QNÓMATCH' +nul_match P P 0 '-i' 'YQNÓMATCH' +nul_match P P 0 '-i' 'QNÓMATCH' test_done From 48de2a768cfdb5bde3af40e0518c96f7df90c0b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:20:59 +0200 Subject: [PATCH 09/18] grep: remove the kwset optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A later change will replace this optimization with optimistic use of PCRE v2. I'm completely removing it as an intermediate step, as opposed to replacing it with PCRE v2, to demonstrate that no grep semantics depend on this (or any other) optimization for the fixed backend anymore. For now this is mostly (but not entirely) a performance regression, as shown by this hacky one-liner: for opt in '' ' -i' do GIT_PERF_7821_GREP_OPTS=$opt GIT_PERF_REPEAT_COUNT=10 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 CFLAGS=-O3 USE_LIBPCRE=YesPlease' ./run origin/master HEAD -- p7821-grep-engines-fixed.sh done && for opt in '' ' -i' do GIT_PERF_4221_LOG_OPTS=$opt GIT_PERF_REPEAT_COUNT=10 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 CFLAGS=-O3 USE_LIBPCRE=YesPlease' ./run origin/master HEAD -- p4221-log-grep-engines-fixed.sh done Which produces: plain grep: Test origin/master HEAD ------------------------------------------------------------------------- 7821.1: fixed grep int 0.55(1.60+0.63) 0.82(3.11+0.51) +49.1% 7821.2: basic grep int 0.62(1.68+0.49) 0.85(3.02+0.52) +37.1% 7821.3: extended grep int 0.61(1.63+0.53) 0.91(3.09+0.44) +49.2% 7821.4: perl grep int 0.55(1.60+0.57) 0.41(0.93+0.57) -25.5% 7821.6: fixed grep uncommon 0.20(0.50+0.44) 0.35(1.27+0.42) +75.0% 7821.7: basic grep uncommon 0.20(0.49+0.45) 0.35(1.29+0.41) +75.0% 7821.8: extended grep uncommon 0.20(0.45+0.48) 0.35(1.25+0.44) +75.0% 7821.9: perl grep uncommon 0.20(0.53+0.41) 0.16(0.24+0.49) -20.0% 7821.11: fixed grep æ 0.35(1.27+0.40) 0.25(0.82+0.39) -28.6% 7821.12: basic grep æ 0.35(1.28+0.38) 0.25(0.75+0.44) -28.6% 7821.13: extended grep æ 0.36(1.21+0.46) 0.25(0.86+0.35) -30.6% 7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.26+0.47) -54.3% grep with -i: Test origin/master HEAD ----------------------------------------------------------------------------- 7821.1: fixed grep -i int 0.61(1.84+0.64) 1.11(4.12+0.64) +82.0% 7821.2: basic grep -i int 0.72(1.86+0.57) 1.15(4.48+0.49) +59.7% 7821.3: extended grep -i int 0.94(1.83+0.60) 1.53(4.12+0.58) +62.8% 7821.4: perl grep -i int 0.66(1.82+0.59) 0.55(1.08+0.58) -16.7% 7821.6: fixed grep -i uncommon 0.21(0.51+0.44) 0.44(1.74+0.34) +109.5% 7821.7: basic grep -i uncommon 0.21(0.55+0.41) 0.44(1.72+0.40) +109.5% 7821.8: extended grep -i uncommon 0.21(0.57+0.39) 0.42(1.64+0.45) +100.0% 7821.9: perl grep -i uncommon 0.21(0.48+0.48) 0.17(0.30+0.45) -19.0% 7821.11: fixed grep -i æ 0.25(0.73+0.45) 0.25(0.75+0.45) +0.0% 7821.12: basic grep -i æ 0.25(0.71+0.49) 0.26(0.77+0.44) +4.0% 7821.13: extended grep -i æ 0.25(0.75+0.44) 0.25(0.74+0.46) +0.0% 7821.14: perl grep -i æ 0.17(0.26+0.48) 0.16(0.20+0.52) -5.9% plain log: Test origin/master HEAD --------------------------------------------------------------------------------- 4221.1: fixed log --grep='int' 7.31(7.06+0.21) 8.11(7.85+0.20) +10.9% 4221.2: basic log --grep='int' 7.30(6.94+0.27) 8.16(7.89+0.19) +11.8% 4221.3: extended log --grep='int' 7.34(7.05+0.21) 8.08(7.76+0.25) +10.1% 4221.4: perl log --grep='int' 7.27(6.94+0.24) 7.05(6.76+0.25) -3.0% 4221.6: fixed log --grep='uncommon' 6.97(6.62+0.32) 7.86(7.51+0.30) +12.8% 4221.7: basic log --grep='uncommon' 7.05(6.69+0.29) 7.89(7.60+0.28) +11.9% 4221.8: extended log --grep='uncommon' 6.89(6.56+0.32) 7.99(7.66+0.24) +16.0% 4221.9: perl log --grep='uncommon' 7.02(6.66+0.33) 6.97(6.54+0.36) -0.7% 4221.11: fixed log --grep='æ' 7.37(7.03+0.33) 7.67(7.30+0.31) +4.1% 4221.12: basic log --grep='æ' 7.41(7.00+0.31) 7.60(7.28+0.26) +2.6% 4221.13: extended log --grep='æ' 7.35(6.96+0.38) 7.73(7.31+0.34) +5.2% 4221.14: perl log --grep='æ' 7.43(7.10+0.32) 6.95(6.61+0.27) -6.5% log with -i: Test origin/master HEAD ------------------------------------------------------------------------------------ 4221.1: fixed log -i --grep='int' 7.40(7.05+0.23) 8.66(8.38+0.20) +17.0% 4221.2: basic log -i --grep='int' 7.39(7.09+0.23) 8.67(8.39+0.20) +17.3% 4221.3: extended log -i --grep='int' 7.29(6.99+0.26) 8.69(8.31+0.26) +19.2% 4221.4: perl log -i --grep='int' 7.42(7.16+0.21) 7.14(6.80+0.24) -3.8% 4221.6: fixed log -i --grep='uncommon' 6.94(6.58+0.35) 8.43(8.04+0.30) +21.5% 4221.7: basic log -i --grep='uncommon' 6.95(6.62+0.31) 8.34(7.93+0.32) +20.0% 4221.8: extended log -i --grep='uncommon' 7.06(6.75+0.25) 8.32(7.98+0.31) +17.8% 4221.9: perl log -i --grep='uncommon' 6.96(6.69+0.26) 7.04(6.64+0.32) +1.1% 4221.11: fixed log -i --grep='æ' 7.92(7.55+0.33) 7.86(7.44+0.34) -0.8% 4221.12: basic log -i --grep='æ' 7.88(7.49+0.32) 7.84(7.46+0.34) -0.5% 4221.13: extended log -i --grep='æ' 7.91(7.51+0.32) 7.87(7.48+0.32) -0.5% 4221.14: perl log -i --grep='æ' 7.01(6.59+0.35) 6.99(6.64+0.28) -0.3% Some of those, as noted in [1] are because PCRE is faster at finding fixed strings. This looks bad for some engines, but in the next change we'll optimistically use PCRE v2 for all of these, so it'll look better. 1. https://public-inbox.org/git/87v9x793qi.fsf@evledraar.gmail.com/ Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 63 +++------------------------------------------------------- grep.h | 2 -- 2 files changed, 3 insertions(+), 62 deletions(-) diff --git a/grep.c b/grep.c index 8d0fff316c..4468519d5c 100644 --- a/grep.c +++ b/grep.c @@ -356,18 +356,6 @@ static NORETURN void compile_regexp_failed(const struct grep_pat *p, die("%s'%s': %s", where, p->pattern, error); } -static int is_fixed(const char *s, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (is_regex_special(s[i])) - return 0; - } - - return 1; -} - #ifdef USE_LIBPCRE1 static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) { @@ -643,38 +631,12 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; + p->fixed = opt->fixed; if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); - /* - * Even when -F (fixed) asks us to do a non-regexp search, we - * may not be able to correctly case-fold when -i - * (ignore-case) is asked (in which case, we'll synthesize a - * regexp to match the pattern that matches regexp special - * characters literally, while ignoring case differences). On - * the other hand, even without -F, if the pattern does not - * have any regexp special characters and there is no need for - * case-folding search, we can internally turn it into a - * simple string match using kws. p->fixed tells us if we - * want to use kws. - */ - if (opt->fixed || is_fixed(p->pattern, p->patternlen)) - p->fixed = !p->ignore_case || !has_non_ascii(p->pattern); - - if (p->fixed) { - p->kws = kwsalloc(p->ignore_case ? tolower_trans_tbl : NULL); - kwsincr(p->kws, p->pattern, p->patternlen); - kwsprep(p->kws); - return; - } - if (opt->fixed) { - /* - * We come here when the pattern has the non-ascii - * characters we cannot case-fold, and asked to - * ignore-case. - */ compile_fixed_regexp(p, opt); return; } @@ -1042,9 +1004,7 @@ void free_grep_patterns(struct grep_opt *opt) case GREP_PATTERN: /* atom */ case GREP_PATTERN_HEAD: case GREP_PATTERN_BODY: - if (p->kws) - kwsfree(p->kws); - else if (p->pcre1_regexp) + if (p->pcre1_regexp) free_pcre1_regexp(p); else if (p->pcre2_pattern) free_pcre2_pattern(p); @@ -1104,29 +1064,12 @@ static void show_name(struct grep_opt *opt, const char *name) opt->output(opt, opt->null_following_name ? "\0" : "\n", 1); } -static int fixmatch(struct grep_pat *p, char *line, char *eol, - regmatch_t *match) -{ - struct kwsmatch kwsm; - size_t offset = kwsexec(p->kws, line, eol - line, &kwsm); - if (offset == -1) { - match->rm_so = match->rm_eo = -1; - return REG_NOMATCH; - } else { - match->rm_so = offset; - match->rm_eo = match->rm_so + kwsm.size[0]; - return 0; - } -} - static int patmatch(struct grep_pat *p, char *line, char *eol, regmatch_t *match, int eflags) { int hit; - if (p->fixed) - hit = !fixmatch(p, line, eol, match); - else if (p->pcre1_regexp) + if (p->pcre1_regexp) hit = !pcre1match(p, line, eol, match, eflags); else if (p->pcre2_pattern) hit = !pcre2match(p, line, eol, match, eflags); diff --git a/grep.h b/grep.h index 4bb8a79d93..d35a137fcb 100644 --- a/grep.h +++ b/grep.h @@ -32,7 +32,6 @@ typedef int pcre2_compile_context; typedef int pcre2_match_context; typedef int pcre2_jit_stack; #endif -#include "kwset.h" #include "thread-utils.h" #include "userdiff.h" @@ -97,7 +96,6 @@ struct grep_pat { pcre2_match_context *pcre2_match_context; pcre2_jit_stack *pcre2_jit_stack; uint32_t pcre2_jit_on; - kwset_t kws; unsigned fixed:1; unsigned ignore_case:1; unsigned word_regexp:1; From b65abcafc7abd93ed634125bcec98b1460e75d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Mon, 1 Jul 2019 23:21:00 +0200 Subject: [PATCH 10/18] grep: use PCRE v2 for optimized fixed-string search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring back optimized fixed-string search for "grep", this time with PCRE v2 as an optional backend. As noted in [1] with kwset we were slower than PCRE v1 and v2 JIT with the kwset backend, so that optimization was counterproductive. This brings back the optimization for "--fixed-strings", without changing the semantics of having a NUL-byte in patterns. As seen in previous commits in this series we could support it now, but I'd rather just leave that edge-case aside so we don't have one behavior or the other depending what "--fixed-strings" backend we're using. It makes the behavior harder to understand and document, and makes tests for the different backends more painful. This does change the behavior under non-C locales when "log"'s "--encoding" option is used and the heystack/needle in the content/command-line doesn't have a matching encoding. See the recent change in "t4210: skip more command-line encoding tests on MinGW" in this series. I think that's OK. We did nothing sensible before then (just compared raw bytes that had no hope of matching). At least now the user will get some idea why their grep/log never matches in that edge case. I could also support the PCRE v1 backend here, but that would make the code more complex. I'd rather aim for simplicity here and in future changes to the diffcore. We're not going to have someone who absolutely must have faster search, but for whom building PCRE v2 isn't acceptable. The difference between this series of commits and the current "master" is, using the same t/perf commands shown in the last commit: plain grep: Test origin/master HEAD ------------------------------------------------------------------------- 7821.1: fixed grep int 0.55(1.67+0.56) 0.41(0.98+0.60) -25.5% 7821.2: basic grep int 0.58(1.65+0.52) 0.41(0.96+0.57) -29.3% 7821.3: extended grep int 0.57(1.66+0.49) 0.42(0.93+0.60) -26.3% 7821.4: perl grep int 0.54(1.67+0.50) 0.43(0.88+0.65) -20.4% 7821.6: fixed grep uncommon 0.21(0.52+0.42) 0.16(0.24+0.51) -23.8% 7821.7: basic grep uncommon 0.20(0.49+0.45) 0.17(0.28+0.47) -15.0% 7821.8: extended grep uncommon 0.20(0.54+0.39) 0.16(0.25+0.50) -20.0% 7821.9: perl grep uncommon 0.20(0.58+0.36) 0.16(0.23+0.50) -20.0% 7821.11: fixed grep æ 0.35(1.24+0.43) 0.16(0.23+0.50) -54.3% 7821.12: basic grep æ 0.36(1.29+0.38) 0.16(0.20+0.54) -55.6% 7821.13: extended grep æ 0.35(1.23+0.44) 0.16(0.24+0.50) -54.3% 7821.14: perl grep æ 0.35(1.33+0.34) 0.16(0.28+0.46) -54.3% grep with -i: Test origin/master HEAD ---------------------------------------------------------------------------- 7821.1: fixed grep -i int 0.62(1.81+0.70) 0.47(1.11+0.64) -24.2% 7821.2: basic grep -i int 0.67(1.90+0.53) 0.46(1.07+0.62) -31.3% 7821.3: extended grep -i int 0.62(1.92+0.53) 0.53(1.12+0.58) -14.5% 7821.4: perl grep -i int 0.66(1.85+0.58) 0.45(1.10+0.59) -31.8% 7821.6: fixed grep -i uncommon 0.21(0.54+0.43) 0.17(0.20+0.55) -19.0% 7821.7: basic grep -i uncommon 0.20(0.52+0.45) 0.17(0.29+0.48) -15.0% 7821.8: extended grep -i uncommon 0.21(0.52+0.44) 0.17(0.26+0.50) -19.0% 7821.9: perl grep -i uncommon 0.21(0.53+0.44) 0.17(0.20+0.56) -19.0% 7821.11: fixed grep -i æ 0.26(0.79+0.44) 0.16(0.29+0.46) -38.5% 7821.12: basic grep -i æ 0.26(0.79+0.42) 0.16(0.20+0.54) -38.5% 7821.13: extended grep -i æ 0.26(0.84+0.39) 0.16(0.24+0.50) -38.5% 7821.14: perl grep -i æ 0.16(0.24+0.49) 0.17(0.25+0.51) +6.3% plain log: Test origin/master HEAD -------------------------------------------------------------------------------- 4221.1: fixed log --grep='int' 7.24(6.95+0.28) 7.20(6.95+0.18) -0.6% 4221.2: basic log --grep='int' 7.31(6.97+0.22) 7.20(6.93+0.21) -1.5% 4221.3: extended log --grep='int' 7.37(7.04+0.24) 7.22(6.91+0.25) -2.0% 4221.4: perl log --grep='int' 7.31(7.04+0.21) 7.19(6.89+0.21) -1.6% 4221.6: fixed log --grep='uncommon' 6.93(6.59+0.32) 7.04(6.66+0.37) +1.6% 4221.7: basic log --grep='uncommon' 6.92(6.58+0.29) 7.08(6.75+0.29) +2.3% 4221.8: extended log --grep='uncommon' 6.92(6.55+0.31) 7.00(6.68+0.31) +1.2% 4221.9: perl log --grep='uncommon' 7.03(6.59+0.33) 7.12(6.73+0.34) +1.3% 4221.11: fixed log --grep='æ' 7.41(7.08+0.28) 7.05(6.76+0.29) -4.9% 4221.12: basic log --grep='æ' 7.39(6.99+0.33) 7.00(6.68+0.25) -5.3% 4221.13: extended log --grep='æ' 7.34(7.00+0.25) 7.15(6.81+0.31) -2.6% 4221.14: perl log --grep='æ' 7.43(7.13+0.26) 7.01(6.60+0.36) -5.7% log with -i: Test origin/master HEAD ------------------------------------------------------------------------------------ 4221.1: fixed log -i --grep='int' 7.31(7.07+0.24) 7.23(7.00+0.22) -1.1% 4221.2: basic log -i --grep='int' 7.40(7.08+0.28) 7.19(6.92+0.20) -2.8% 4221.3: extended log -i --grep='int' 7.43(7.13+0.25) 7.27(6.99+0.21) -2.2% 4221.4: perl log -i --grep='int' 7.34(7.10+0.24) 7.10(6.90+0.19) -3.3% 4221.6: fixed log -i --grep='uncommon' 7.07(6.71+0.32) 7.11(6.77+0.28) +0.6% 4221.7: basic log -i --grep='uncommon' 6.99(6.64+0.28) 7.12(6.69+0.38) +1.9% 4221.8: extended log -i --grep='uncommon' 7.11(6.74+0.32) 7.10(6.77+0.27) -0.1% 4221.9: perl log -i --grep='uncommon' 6.98(6.60+0.29) 7.05(6.64+0.34) +1.0% 4221.11: fixed log -i --grep='æ' 7.85(7.45+0.34) 7.03(6.68+0.32) -10.4% 4221.12: basic log -i --grep='æ' 7.87(7.49+0.29) 7.06(6.69+0.31) -10.3% 4221.13: extended log -i --grep='æ' 7.87(7.54+0.31) 7.09(6.69+0.31) -9.9% 4221.14: perl log -i --grep='æ' 7.06(6.77+0.28) 6.91(6.57+0.31) -2.1% So as with e05b027627 ("grep: use PCRE v2 for optimized fixed-string search", 2019-06-26) there's a huge improvement in performance for "grep", but in "log" most of our time is spent elsewhere, so we don't notice it that much. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/grep.c b/grep.c index 4468519d5c..fc0ed73ef3 100644 --- a/grep.c +++ b/grep.c @@ -356,6 +356,18 @@ static NORETURN void compile_regexp_failed(const struct grep_pat *p, die("%s'%s': %s", where, p->pattern, error); } +static int is_fixed(const char *s, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (is_regex_special(s[i])) + return 0; + } + + return 1; +} + #ifdef USE_LIBPCRE1 static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) { @@ -602,7 +614,6 @@ static int pcre2match(struct grep_pat *p, const char *line, const char *eol, static void free_pcre2_pattern(struct grep_pat *p) { } -#endif /* !USE_LIBPCRE2 */ static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) { @@ -623,11 +634,13 @@ static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) compile_regexp_failed(p, errbuf); } } +#endif /* !USE_LIBPCRE2 */ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { int err; int regflags = REG_NEWLINE; + int pat_is_fixed; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; @@ -636,8 +649,42 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); - if (opt->fixed) { + pat_is_fixed = is_fixed(p->pattern, p->patternlen); + if (opt->fixed || pat_is_fixed) { +#ifdef USE_LIBPCRE2 + opt->pcre2 = 1; + if (pat_is_fixed) { + compile_pcre2_pattern(p, opt); + } else { + /* + * E.g. t7811-grep-open.sh relies on the + * pattern being restored. + */ + char *old_pattern = p->pattern; + size_t old_patternlen = p->patternlen; + struct strbuf sb = STRBUF_INIT; + + /* + * There is the PCRE2_LITERAL flag, but it's + * only in PCRE v2 10.30 and later. Needing to + * ifdef our way around that and dealing with + * it + PCRE2_MULTILINE being an error is more + * complex than just quoting this ourselves. + */ + strbuf_add(&sb, "\\Q", 2); + strbuf_add(&sb, p->pattern, p->patternlen); + strbuf_add(&sb, "\\E", 2); + + p->pattern = sb.buf; + p->patternlen = sb.len; + compile_pcre2_pattern(p, opt); + p->pattern = old_pattern; + p->patternlen = old_patternlen; + strbuf_release(&sb); + } +#else /* !USE_LIBPCRE2 */ compile_fixed_regexp(p, opt); +#endif /* !USE_LIBPCRE2 */ return; } From 04bef50c0175bba195443ea0edbf2991175cdd91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:11 +0200 Subject: [PATCH 11/18] grep: remove overly paranoid BUG(...) code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove code that would trigger if pcre_config() or pcre2_config() was so broken that "do we have JIT?" wouldn't return a boolean. I added this code back in fbaceaac47 ("grep: add support for the PCRE v1 JIT API", 2017-05-25) and then as noted in f002532784 ("grep: print the pcre2_jit_on value", 2019-07-22) incorrectly copy/pasted some of it in 94da9193a6 ("grep: add support for PCRE v2", 2017-06-01). Let's just remove this code. Being this paranoid about the pcre2?_config() function itself being broken is crossing the line into unreasonable paranoia. Reported-by: Beat Bolli Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/grep.c b/grep.c index fc0ed73ef3..95af88cb74 100644 --- a/grep.c +++ b/grep.c @@ -394,14 +394,11 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) #ifdef GIT_PCRE1_USE_JIT pcre_config(PCRE_CONFIG_JIT, &p->pcre1_jit_on); - if (p->pcre1_jit_on == 1) { + if (p->pcre1_jit_on) { p->pcre1_jit_stack = pcre_jit_stack_alloc(1, 1024 * 1024); if (!p->pcre1_jit_stack) die("Couldn't allocate PCRE JIT stack"); pcre_assign_jit_stack(p->pcre1_extra_info, NULL, p->pcre1_jit_stack); - } else if (p->pcre1_jit_on != 0) { - BUG("The pcre1_jit_on variable should be 0 or 1, not %d", - p->pcre1_jit_on); } #endif } @@ -510,7 +507,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on); - if (p->pcre2_jit_on == 1) { + if (p->pcre2_jit_on) { jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE); if (jitret) die("Couldn't JIT the PCRE2 pattern '%s', got '%d'\n", p->pattern, jitret); @@ -545,9 +542,6 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt if (!p->pcre2_match_context) die("Couldn't allocate PCRE2 match context"); pcre2_jit_stack_assign(p->pcre2_match_context, NULL, p->pcre2_jit_stack); - } else if (p->pcre2_jit_on != 0) { - BUG("The pcre2_jit_on variable should be 0 or 1, not %d", - p->pcre1_jit_on); } } From 34489239d0f920ddc3bfff1c4cfe2c13ad02b2cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:12 +0200 Subject: [PATCH 12/18] grep: stop "using" a custom JIT stack with PCRE v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported in [1] the code I added in 94da9193a6 ("grep: add support for PCRE v2", 2017-06-01) to use a custom JIT stack has never worked. It was incorrectly copy/pasted from code I added in fbaceaac47 ("grep: add support for the PCRE v1 JIT API", 2017-05-25), which did work. Thus our intention of starting with 1 byte of stack at a maximum of 1 MB didn't happen, we'd always use the 32 KB stack provided by PCRE v2's jit_machine_stack_exec()[2]. The reason I allocated a custom stack at all was this advice in pcrejit(3) (same in pcre2jit(3)): "By default, it uses 32KiB on the machine stack. However, some large or complicated patterns need more than this" Since we've haven't had any reports of users running into PCRE2_ERROR_JIT_STACKLIMIT in the wild I think we can safely assume that we can just use the library defaults instead and drop this code. This won't change with the wider use of PCRE v2 in ed0479ce3d ("Merge branch 'ab/no-kwset' into next", 2019-07-15), a fixed string search is not a "large or complicated pattern". For good measure I ran the performance test noted in 94da9193a6, although the command is simpler now due to my 0f50c8e32c ("Makefile: remove the NO_R_TO_GCC_LINKER flag", 2019-05-17): GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE2=Y CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst' ./run HEAD~ HEAD p7820-grep-engines.sh Just the /perl/ results are: Test HEAD~ HEAD --------------------------------------------------------------------------------------- 7820.3: perl grep 'how.to' 0.17(0.27+0.65) 0.17(0.24+0.68) +0.0% 7820.7: perl grep '^how to' 0.16(0.23+0.66) 0.16(0.23+0.67) +0.0% 7820.11: perl grep '[how] to' 0.18(0.35+0.62) 0.18(0.33+0.65) +0.0% 7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.17(0.45+0.54) 0.17(0.49+0.50) +0.0% 7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.16(0.33+0.58) 0.16(0.29+0.62) +0.0% So, as expected there's no change, and running with valgrind reveals that we have fewer allocations now. As noted in [3] there are known regexes that will fail with the lower stack limit, the way GNU grep fixed it is interesting, although I believe the implementation is overly verbose, they could make PCRE v2 handle that gradual re-allocation, that's what min/max memory is for. So we might end up bringing this back, I'm more inclined to just kick such cases upstairs to PCRE maintainers as a bug, perhaps they'll add some overall "just allocate more then" flag to make this easier. In any case there's no functional change here, we didn't have a custom stack, so let's apply this first, we can always revert it later. 1. https://public-inbox.org/git/20190721194052.15440-1-carenas@gmail.com/ 2. I didn't really intend to start with 1 byte, looking at the PCRE v2 code again what happened is that I cargo-culted some of PCRE v2's own test code which was meant to test re-allocations. It's more sane to start with say 32 KB with a max of 1 MB, as pcre2grep.c does. 3. https://public-inbox.org/git/CAPUEspjj+fG8QDmf=bZXktfpLgkgiu34HTjKLhm-cmEE04FE-A@mail.gmail.com/ Reported-by: Carlo Marcelo Arenas Belón Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 10 ---------- grep.h | 4 ---- 2 files changed, 14 deletions(-) diff --git a/grep.c b/grep.c index 95af88cb74..4b1e917ac5 100644 --- a/grep.c +++ b/grep.c @@ -534,14 +534,6 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt p->pcre2_jit_on = 0; return; } - - p->pcre2_jit_stack = pcre2_jit_stack_create(1, 1024 * 1024, NULL); - if (!p->pcre2_jit_stack) - die("Couldn't allocate PCRE2 JIT stack"); - p->pcre2_match_context = pcre2_match_context_create(NULL); - if (!p->pcre2_match_context) - die("Couldn't allocate PCRE2 match context"); - pcre2_jit_stack_assign(p->pcre2_match_context, NULL, p->pcre2_jit_stack); } } @@ -585,8 +577,6 @@ static void free_pcre2_pattern(struct grep_pat *p) pcre2_compile_context_free(p->pcre2_compile_context); pcre2_code_free(p->pcre2_pattern); pcre2_match_data_free(p->pcre2_match_data); - pcre2_jit_stack_free(p->pcre2_jit_stack); - pcre2_match_context_free(p->pcre2_match_context); } #else /* !USE_LIBPCRE2 */ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt) diff --git a/grep.h b/grep.h index d35a137fcb..4d8e300175 100644 --- a/grep.h +++ b/grep.h @@ -29,8 +29,6 @@ typedef int pcre_jit_stack; typedef int pcre2_code; typedef int pcre2_match_data; typedef int pcre2_compile_context; -typedef int pcre2_match_context; -typedef int pcre2_jit_stack; #endif #include "thread-utils.h" #include "userdiff.h" @@ -93,8 +91,6 @@ struct grep_pat { pcre2_code *pcre2_pattern; pcre2_match_data *pcre2_match_data; pcre2_compile_context *pcre2_compile_context; - pcre2_match_context *pcre2_match_context; - pcre2_jit_stack *pcre2_jit_stack; uint32_t pcre2_jit_on; unsigned fixed:1; unsigned ignore_case:1; From 685668faaae6daf5990068b198525491591aff87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:13 +0200 Subject: [PATCH 13/18] grep: stop using a custom JIT stack with PCRE v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the PCRE v1 code for the same reasons as for the PCRE v2 code in the last commit. Unlike with v2 we actually used the custom stack in v1, but let's use PCRE's built-in 32 KB one instead, since experience with v2 shows that's enough. Most distros are already using v2 as a default, and the underlying sljit code is the same. Unfortunately we can't just pass a NULL to pcre_jit_exec() as with pcre2_jit_match(). Unlike the v2 function it doesn't support that. Instead we need to use the fatter pcre_exec() if we'd like the same behavior. This will make things slightly slower than on the fast-path function, but it's OK since we care less about v1 performance these days since we have and recommend v2. Running a similar performance test as what I ran in fbaceaac47 ("grep: add support for the PCRE v1 JIT API", 2017-05-25) via: GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE1=Y CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst' ./run HEAD~ HEAD p7820-grep-engines.sh Gives us this, just the /perl/ results: Test HEAD~ HEAD --------------------------------------------------------------------------------------- 7820.3: perl grep 'how.to' 0.19(0.67+0.52) 0.19(0.65+0.52) +0.0% 7820.7: perl grep '^how to' 0.19(0.78+0.44) 0.19(0.72+0.49) +0.0% 7820.11: perl grep '[how] to' 0.39(2.13+0.43) 0.40(2.10+0.46) +2.6% 7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.44(2.55+0.37) 0.45(2.47+0.41) +2.3% 7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.23(1.06+0.42) 0.22(1.03+0.43) -4.3% It will also implicitly re-enable UTF-8 validation for PCRE v1. As noted in [1] we now have cases as a result where PCRE v1 is more eager to error out. Subsequent patches will fix that for v2, and I think it's fair to tell v1 users "just upgrade" and not worry about that edge case for v1. 1. https://public-inbox.org/git/CAPUEsphZJ_Uv9o1-yDpjNLA_q-f7gWXz9g1gCY2pYAYN8ri40g@mail.gmail.com/ Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 28 +++++----------------------- grep.h | 5 ----- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/grep.c b/grep.c index 4b1e917ac5..9c2b259771 100644 --- a/grep.c +++ b/grep.c @@ -394,12 +394,6 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) #ifdef GIT_PCRE1_USE_JIT pcre_config(PCRE_CONFIG_JIT, &p->pcre1_jit_on); - if (p->pcre1_jit_on) { - p->pcre1_jit_stack = pcre_jit_stack_alloc(1, 1024 * 1024); - if (!p->pcre1_jit_stack) - die("Couldn't allocate PCRE JIT stack"); - pcre_assign_jit_stack(p->pcre1_extra_info, NULL, p->pcre1_jit_stack); - } #endif } @@ -411,18 +405,9 @@ static int pcre1match(struct grep_pat *p, const char *line, const char *eol, if (eflags & REG_NOTBOL) flags |= PCRE_NOTBOL; -#ifdef GIT_PCRE1_USE_JIT - if (p->pcre1_jit_on) { - ret = pcre_jit_exec(p->pcre1_regexp, p->pcre1_extra_info, line, - eol - line, 0, flags, ovector, - ARRAY_SIZE(ovector), p->pcre1_jit_stack); - } else -#endif - { - ret = pcre_exec(p->pcre1_regexp, p->pcre1_extra_info, line, - eol - line, 0, flags, ovector, - ARRAY_SIZE(ovector)); - } + ret = pcre_exec(p->pcre1_regexp, p->pcre1_extra_info, line, + eol - line, 0, flags, ovector, + ARRAY_SIZE(ovector)); if (ret < 0 && ret != PCRE_ERROR_NOMATCH) die("pcre_exec failed with error code %d", ret); @@ -439,14 +424,11 @@ static void free_pcre1_regexp(struct grep_pat *p) { pcre_free(p->pcre1_regexp); #ifdef GIT_PCRE1_USE_JIT - if (p->pcre1_jit_on) { + if (p->pcre1_jit_on) pcre_free_study(p->pcre1_extra_info); - pcre_jit_stack_free(p->pcre1_jit_stack); - } else + else #endif - { pcre_free(p->pcre1_extra_info); - } pcre_free((void *)p->pcre1_tables); } #else /* !USE_LIBPCRE1 */ diff --git a/grep.h b/grep.h index 4d8e300175..ce2d72571f 100644 --- a/grep.h +++ b/grep.h @@ -14,13 +14,9 @@ #ifndef GIT_PCRE_STUDY_JIT_COMPILE #define GIT_PCRE_STUDY_JIT_COMPILE 0 #endif -#if PCRE_MAJOR <= 8 && PCRE_MINOR < 20 -typedef int pcre_jit_stack; -#endif #else typedef int pcre; typedef int pcre_extra; -typedef int pcre_jit_stack; #endif #ifdef USE_LIBPCRE2 #define PCRE2_CODE_UNIT_WIDTH 8 @@ -85,7 +81,6 @@ struct grep_pat { regex_t regexp; pcre *pcre1_regexp; pcre_extra *pcre1_extra_info; - pcre_jit_stack *pcre1_jit_stack; const unsigned char *pcre1_tables; int pcre1_jit_on; pcre2_code *pcre2_pattern; From 8a35b540a99d909ee4680e773c1d3befb6bff782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:14 +0200 Subject: [PATCH 14/18] grep: consistently use "p->fixed" in compile_regexp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the start of this function we do: p->fixed = opt->fixed; It's less confusing to use that variable consistently that switch back & forth between the two. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grep.c b/grep.c index 9c2b259771..b94e998680 100644 --- a/grep.c +++ b/grep.c @@ -616,7 +616,7 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); pat_is_fixed = is_fixed(p->pattern, p->patternlen); - if (opt->fixed || pat_is_fixed) { + if (p->fixed || pat_is_fixed) { #ifdef USE_LIBPCRE2 opt->pcre2 = 1; if (pat_is_fixed) { From 09872f6418f6b6fc1b823d3b324907c02e9bc75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:15 +0200 Subject: [PATCH 15/18] grep: create a "is_fixed" member in "grep_pat" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change paves the way for later using this value the regex compile functions themselves. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 7 +++---- grep.h | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/grep.c b/grep.c index b94e998680..6d60e2e557 100644 --- a/grep.c +++ b/grep.c @@ -606,7 +606,6 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { int err; int regflags = REG_NEWLINE; - int pat_is_fixed; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; @@ -615,11 +614,11 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2) die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); - pat_is_fixed = is_fixed(p->pattern, p->patternlen); - if (p->fixed || pat_is_fixed) { + p->is_fixed = is_fixed(p->pattern, p->patternlen); + if (p->fixed || p->is_fixed) { #ifdef USE_LIBPCRE2 opt->pcre2 = 1; - if (pat_is_fixed) { + if (p->is_fixed) { compile_pcre2_pattern(p, opt); } else { /* diff --git a/grep.h b/grep.h index ce2d72571f..c0c71eb4a9 100644 --- a/grep.h +++ b/grep.h @@ -88,6 +88,7 @@ struct grep_pat { pcre2_compile_context *pcre2_compile_context; uint32_t pcre2_jit_on; unsigned fixed:1; + unsigned is_fixed:1; unsigned ignore_case:1; unsigned word_regexp:1; }; From 8a5999838e3cc24652f09670b6fe9461a789721b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:16 +0200 Subject: [PATCH 16/18] grep: stess test PCRE v2 on invalid UTF-8 data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string search", 2019-07-01) we've been dying on invalid UTF-8 data when grepping for fixed strings if the following are all true: * The subject string is non-ASCII (e.g. "ævar") * We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C" * We compiled with PCRE v2 * That PCRE v2 did not have JIT support The last of those is why this wasn't caught earlier, per pcre2jit(3): "unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these checks do not happen on the JIT fast path, and if invalid data is passed, the result is undefined." I.e. the subject being matched against our pattern was invalid, but we were lucky and getting away with it on the JIT path, but the non-JIT one is stricter. This patch does nothing to fix that, instead we sneak in support for fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2 jit with implicit fixed-string matching for testing, see pcre2syntax(3) the syntax. This is technically a change in behavior, but it's so obscure that I figured it was OK. We'd previously consider this an invalid regular expression as regcomp() would die on it, now we feed it to the PCRE v2 fixed-string path. I thought this was better than introducing yet another GIT_TEST_* environment variable. We're also relying on a behavior of PCRE v2 that technically could change, but I think the test coverage is worth dipping our toe into some somewhat undefined behavior. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 10 ++++++++++ t/t7812-grep-icase-non-ascii.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/grep.c b/grep.c index 6d60e2e557..5bc0f4f32a 100644 --- a/grep.c +++ b/grep.c @@ -615,6 +615,16 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) die(_("given pattern contains NULL byte (via -f ). This is only supported with -P under PCRE v2")); p->is_fixed = is_fixed(p->pattern, p->patternlen); +#ifdef USE_LIBPCRE2 + if (!p->fixed && !p->is_fixed) { + const char *no_jit = "(*NO_JIT)"; + const int no_jit_len = strlen(no_jit); + if (starts_with(p->pattern, no_jit) && + is_fixed(p->pattern + no_jit_len, + p->patternlen - no_jit_len)) + p->is_fixed = 1; + } +#endif if (p->fixed || p->is_fixed) { #ifdef USE_LIBPCRE2 opt->pcre2 = 1; diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 0c685d3598..96c3572056 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -53,4 +53,32 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' ' test_cmp expected actual ' +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' ' + printf "\\200\\n" >invalid-0x80 && + echo "ævar" >expected && + cat expected >>invalid-0x80 && + git add invalid-0x80 +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UTF-8 data' ' + git grep -h "var" invalid-0x80 >actual && + test_cmp expected actual && + git grep -h "(*NO_JIT)var" invalid-0x80 >actual && + test_cmp expected actual +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data' ' + test_might_fail git grep -h "æ" invalid-0x80 >actual && + test_cmp expected actual && + test_must_fail git grep -h "(*NO_JIT)æ" invalid-0x80 && + test_cmp expected actual +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' ' + test_might_fail git grep -hi "Æ" invalid-0x80 >actual && + test_cmp expected actual && + test_must_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 && + test_cmp expected actual +' + test_done From 870eea81669bfff4333b37b11fedd870cd05fd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Fri, 26 Jul 2019 17:08:17 +0200 Subject: [PATCH 17/18] grep: do not enter PCRE2_UTF mode on fixed matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed in the last commit partially fix a bug introduced in b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string search", 2019-07-01). Because PCRE v2, unlike kwset, validates its UTF-8 input we'd die on e.g.: fatal: pcre2_match failed with error code -22: UTF-8 error: isolated byte with 0x80 bit set When grepping a non-ASCII fixed string. This is a more general problem that's hard to fix, but we can at least fix the most common case of grepping for a fixed string without "-i". I can't think of a reason for why we'd turn on PCRE2_UTF when matching byte-for-byte like that. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- grep.c | 3 ++- t/t7812-grep-icase-non-ascii.sh | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/grep.c b/grep.c index 5bc0f4f32a..c7c06ae08d 100644 --- a/grep.c +++ b/grep.c @@ -472,7 +472,8 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } options |= PCRE2_CASELESS; } - if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern)) + if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) && + !(!opt->ignore_case && (p->fixed || p->is_fixed))) options |= PCRE2_UTF; p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern, diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 96c3572056..531eb59d57 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -68,9 +68,9 @@ test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UT ' test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data' ' - test_might_fail git grep -h "æ" invalid-0x80 >actual && + git grep -h "æ" invalid-0x80 >actual && test_cmp expected actual && - test_must_fail git grep -h "(*NO_JIT)æ" invalid-0x80 && + git grep -h "(*NO_JIT)æ" invalid-0x80 && test_cmp expected actual ' From c581e4a7499b9e1089847dbbc057afbef1ed861e Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Sun, 18 Aug 2019 22:17:27 +0200 Subject: [PATCH 18/18] grep: under --debug, show whether PCRE JIT is enabled This information is useful and not visible anywhere else, so show it. Signed-off-by: Beat Bolli Suggested-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- grep.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/grep.c b/grep.c index c7c06ae08d..76088784e3 100644 --- a/grep.c +++ b/grep.c @@ -394,6 +394,8 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt) #ifdef GIT_PCRE1_USE_JIT pcre_config(PCRE_CONFIG_JIT, &p->pcre1_jit_on); + if (opt->debug) + fprintf(stderr, "pcre1_jit_on=%d\n", p->pcre1_jit_on); #endif } @@ -490,6 +492,8 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on); + if (opt->debug) + fprintf(stderr, "pcre2_jit_on=%d\n", p->pcre2_jit_on); if (p->pcre2_jit_on) { jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE); if (jitret) @@ -515,6 +519,9 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt BUG("pcre2_pattern_info() failed: %d", patinforet); if (jitsizearg == 0) { p->pcre2_jit_on = 0; + if (opt->debug) + fprintf(stderr, "pcre2_jit_on=%d: (*NO_JIT) in regex\n", + p->pcre2_jit_on); return; } }