regex: mixed sets are misidentified as singletons

Fix "singleton" function used by regcomp() to turn character set matches
into exact character matches if a character set has exactly one
element.

The underlying cset representation is complex; most critically it
records "small" characters (codepoint less than either 128
or 256 depending on locale) in a bit vector, and "wide" characters in
a secondary array.

Unfortunately, the "singleton" function used to identify singleton sets
treated a cset as a singleton if either the "small" or the "wide" sets
had exactly one element (it would then ignore the other set).

The easiest way to demonstrate this bug:

	$ export LANG=C.UTF-8
	$ echo 'a' | grep '[abà]'

It should match (and print "a") but instead it doesn't match, because the
set is misinterpreted as a singleton containing only the accented
character, and its other members are ignored.

Reviewed by:	kevans, yuripv
Obtained from:	illumos
Differential Revision:	https://reviews.freebsd.org/D43149
This commit is contained in:
Bill Sommerfeld 2023-12-21 10:46:14 +07:00 committed by Yuri Pankov
parent 488e8a7fac
commit 8f7ed58a15
2 changed files with 62 additions and 6 deletions

View file

@ -1586,17 +1586,32 @@ singleton(cset *cs)
{
	/*
	 * Return the one character this set matches, or OUT when the set
	 * cannot be treated as a single exact character.
	 *
	 * A set has two sides: a bitmap tested with CHIN() for codepoints
	 * below NC, and the cs->wides array.  Both sides must be looked at
	 * together; a set with one member on one side and any member on the
	 * other is not a singleton.
	 */
	wint_t i, s, n;

	/* Exclude the complicated cases we don't want to deal with */
	if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
		return (OUT);

	/* Two or more wide members can never be a singleton. */
	if (cs->nwides > 1)
		return (OUT);

	/* Count the number of characters present in the bitmap */
	for (i = n = 0; i < NC; i++)
		if (CHIN(cs, i)) {
			n++;
			s = i;	/* only consulted below when n == 1 */
		}

	if (n > 1)
		return (OUT);

	if (n == 1) {
		/* One bitmap member: a singleton only if no wide member. */
		if (cs->nwides == 0)
			return (s);
		else
			return (OUT);
	}

	/* Bitmap is empty: a lone wide member is the singleton. */
	if (cs->nwides == 1)
		return (cs->wides[0]);

	/* Don't bother handling the other cases. */
	return (OUT);
}

View file

@ -1,4 +1,3 @@
atf_test_case bmpat
bmpat_head()
{
@ -45,8 +44,50 @@ icase_body()
echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
}
# mbset: multibyte bracket-expression matching; has a cleanup routine.
atf_test_case mbset cleanup
mbset_head()
{
	atf_set descr 'Check multibyte sets matching'
}
mbset_body()
{
	export LC_CTYPE="C.UTF-8"

	# Regression test for a broken optimization that collapses a
	# one-element set into an exact match on a single codepoint.
	# A set keeps small codepoints in a bitmap and large codepoints in an
	# array; the optimization fired when either side alone held exactly
	# one member and ignored whatever the other side contained.
	#
	# Each table entry below is "<set members>:<inputs to try>".  Every
	# set mixes one member from one side with one or more from the other,
	# and every member must be matched, with both basic (-ne) and
	# extended (-Ene) regular expressions.
	for sedflags in -ne -Ene; do
		for entry in 'aà:a à' 'aàá:a à á' 'abà:à a b'; do
			members=${entry%%:*}
			for input in ${entry#*:}; do
				printf "$input" > mbset
				atf_check -o not-empty \
				    sed $sedflags "/[$members]/p" mbset
			done
		done
	done
}
mbset_cleanup()
{
	# Remove the scratch input file written by mbset_body.
	rm -f -- mbset
}
atf_init_test_cases()
{
	# Register every test case defined in this file, in order.
	for tc in bmpat icase mbset; do
		atf_add_test_case "$tc"
	done
}