unicode: Add support for high Unicode planes in decomposition tables.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
Alexandre Julliard 2020-02-17 10:44:04 +01:00
parent 148f564d10
commit c658731975
5 changed files with 4478 additions and 1695 deletions

View file

@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) ); dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen ); todo_wine
ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine_if (i == 0 || i == 2)
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
if (pRtlNormalizeString) if (pRtlNormalizeString)
{ {
@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
dstlen = ARRAY_SIZE(dst); dstlen = ARRAY_SIZE(dst);
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status ); ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
dstlen = 1; dstlen = 1;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine_if( i == 0 || i == 2)
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status ); ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
todo_wine_if (i != 3) todo_wine_if( i != 3)
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
dstlen = 2; dstlen = 2;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status ); ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
} }
} }

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
} }
static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len ) static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
{ {
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf); unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
unsigned short start = table[offset]; unsigned short start = table[offset];
unsigned short end = table[offset + 1]; unsigned short end = table[offset + 1];
if ((*len = end - start)) return table + start; if ((*len = end - start)) return table + start;
*len = 1; *len = 1 + (ch >= 0x10000);
return NULL; return NULL;
} }
@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
} }
static BOOL is_starter( WCHAR c ) static BOOL is_starter( unsigned int c )
{ {
return !get_combining_class( c ); return !get_combining_class( c );
} }
static BOOL reorderable_pair( WCHAR c1, WCHAR c2 ) static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
{ {
BYTE ccc1, ccc2; BYTE ccc1, ccc2;
@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
return ccc2 && (ccc1 > ccc2); return ccc2 && (ccc1 > ccc2);
} }
/* Read one code point from the start of a UTF-16 buffer.
 *
 * src    - buffer to read from; src[0] is always examined
 * srclen - number of WCHARs available at src
 * ch     - receives the decoded value; left untouched on failure
 *
 * Returns the number of WCHARs consumed (1 or 2), or 0 when src starts
 * with a low surrogate, or with a high surrogate that is not followed
 * by a low surrogate inside the buffer.
 */
static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
{
    const WCHAR lead = src[0];

    if (IS_HIGH_SURROGATE( lead ))
    {
        /* a pair needs a second unit, and that unit must be a low surrogate */
        if (srclen > 1 && IS_LOW_SURROGATE( src[1] ))
        {
            *ch = 0x10000 + ((lead & 0x3ff) << 10) + (src[1] & 0x3ff);
            return 2;
        }
        return 0;
    }
    /* a low surrogate with no preceding high surrogate cannot be decoded */
    if (IS_LOW_SURROGATE( lead )) return 0;

    *ch = lead;
    return 1;
}
/* Store the code point ch at dst in UTF-16 form.
 *
 * Values below 0x10000 are written as a single WCHAR; anything else is
 * written as two WCHARs (0xd800-based unit first, 0xdc00-based unit second).
 * The caller must provide room for the units written; ch is not
 * range-checked here.
 */
static void put_utf16( WCHAR *dst, unsigned int ch )
{
    unsigned int offset;

    if (ch < 0x10000)
    {
        dst[0] = ch;
        return;
    }
    /* split the offset above 0x10000 into its upper bits and low 10 bits */
    offset = ch - 0x10000;
    dst[0] = 0xd800 | (offset >> 10);
    dst[1] = 0xdc00 | (offset & 0x3ff);
}
static void canonical_order_substring( WCHAR *str, unsigned int len ) static void canonical_order_substring( WCHAR *str, unsigned int len )
{ {
unsigned int i; unsigned int i, ch1, ch2, len1, len2;
BOOL swapped; BOOL swapped;
do do
{ {
swapped = FALSE; swapped = FALSE;
for (i = 0; i < len - 1; i++) for (i = 0; i < len - 1; i += len1)
{ {
if (reorderable_pair( str[i], str[i + 1] )) if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
if (i + len1 >= len) break;
if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
if (reorderable_pair( ch1, ch2 ))
{ {
WCHAR tmp = str[i]; WCHAR tmp[2];
str[i] = str[i + 1]; memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
str[i + 1] = tmp; memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
swapped = TRUE; swapped = TRUE;
i += len2 - len1;
} }
} }
} while (swapped); } while (swapped);
@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
*/ */
static void canonical_order_string( WCHAR *str, unsigned int len ) static void canonical_order_string( WCHAR *str, unsigned int len )
{ {
unsigned int i, next = 0; unsigned int ch, i, r, next = 0;
for (i = 1; i <= len; i++) for (i = 0; i < len; i += r)
{ {
if (i == len || is_starter( str[i] )) if (!(r = get_utf16( str + i, len - i, &ch ))) return;
if (i && is_starter( ch ))
{ {
if (i > next + 1) /* at least two successive non-starters */ if (i > next + 1) /* at least two successive non-starters */
canonical_order_substring( str + next, i - next ); canonical_order_substring( str + next, i - next );
next = i + 1; next = i + r;
} }
} }
if (i > next + 1) canonical_order_substring( str + next, i - next );
} }
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len ) static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
{ {
const unsigned short *table = compat ? nfkd_table : nfd_table; const unsigned short *table = compat ? nfkd_table : nfd_table;
int src_pos, dst_pos = 0; int src_pos, dst_pos;
unsigned int decomp_len; unsigned int ch, len, decomp_len;
const WCHAR *decomp; const WCHAR *decomp;
for (src_pos = 0; src_pos < src_len; src_pos++) for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
{ {
if (dst_pos == *dst_len) break; if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
if ((decomp = get_decomposition( table, src[src_pos], &decomp_len ))) (ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
{ {
if (dst_pos + decomp_len > *dst_len) break; *dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) ); return STATUS_NO_UNICODE_TRANSLATION;
} }
else dst[dst_pos] = src[src_pos]; decomp = get_decomposition( table, ch, &decomp_len );
dst_pos += decomp_len; if (dst_pos + decomp_len > *dst_len) break;
if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
else put_utf16( dst + dst_pos, ch );
} }
if (src_pos < src_len) if (src_pos < src_len)
{ {
*dst_len += (src_len - src_pos) * (compat ? 18 : 3); *dst_len += (src_len - src_pos) * (compat ? 18 : 3);
@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
} }
/* Return the value of the character at src, combining a surrogate pair
 * into a single code point when one is present.
 *
 * Returns 0 when src[0] is in the 0xd800-0xdfff range but does not start
 * a valid pair: it is above 0xdbff, no second unit is available
 * (srclen <= 1), or src[1] is outside 0xdc00-0xdfff.
 */
static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
{
    const WCHAR first = src[0];

    /* not a surrogate at all: the unit is the value */
    if (first < 0xd800 || first > 0xdfff) return first;

    if (first > 0xdbff) return 0;                        /* invalid high surrogate */
    if (srclen <= 1) return 0;                           /* missing low surrogate */
    if (src[1] < 0xdc00 || src[1] > 0xdfff) return 0;    /* invalid low surrogate */

    return 0x10000 + ((first & 0x3ff) << 10) + (src[1] & 0x3ff);
}
/************************************************************************** /**************************************************************************
* RtlUnicodeToUTF8N (NTDLL.@) * RtlUnicodeToUTF8N (NTDLL.@)
*/ */
@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */ else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
else else
{ {
if (!(val = get_surrogate_value( src, srclen ))) if (!get_utf16( src, srclen, &val ))
{ {
val = 0xfffd; val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED; status = STATUS_SOME_NOT_MAPPED;
@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
dst += 2; dst += 2;
continue; continue;
} }
if (!(val = get_surrogate_value( src, srclen ))) if (!get_utf16( src, srclen, &val ))
{ {
val = 0xfffd; val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED; status = STATUS_SOME_NOT_MAPPED;

View file

@ -480,7 +480,7 @@ sub build_decompositions(@)
my @src = @_; my @src = @_;
my @dst; my @dst;
for (my $i = 0; $i < 65536; $i++) for (my $i = 0; $i < @src; $i++)
{ {
next unless defined $src[$i]; next unless defined $src[$i];
my @decomp = get_decomposition( $i, \@src ); my @decomp = get_decomposition( $i, \@src );
@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
# first determine all the 16-char subsets that contain something # first determine all the 16-char subsets that contain something
my @filled = (0) x 4096; my $level1 = ($MAX_CHAR + 1) / 16;
my $level2 = $level1 / 16;
my @filled = (0) x $level1;
my $pos = 16; # for the null subset my $pos = 16; # for the null subset
my $data_total = 0; my $data_total = 0;
for (my $i = 0; $i < 65536; $i++) for (my $i = 0; $i <= $MAX_CHAR; $i++)
{ {
next unless defined $decomp[$i]; next unless defined $decomp[$i];
if ($filled[$i >> 4] == 0) if ($filled[$i >> 4] == 0)
@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
# now count the 256-char subsets that contain something # now count the 256-char subsets that contain something
my @filled_idx = (256) x 256; my @filled_idx = ($level2) x $level2;
$pos = 256 + 16; $pos = $level2 + 16;
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
$filled_idx[$i >> 4] = $pos; $filled_idx[$i >> 4] = $pos;
@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
# add the index offsets to the subsets positions # add the index offsets to the subsets positions
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
$filled[$i] += $null_offset; $filled[$i] += $null_offset;
@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
# dump the second-level indexes # dump the second-level indexes
for (my $i = 0; $i < 256; $i++) for (my $i = 0; $i < $level2; $i++)
{ {
next unless ($filled_idx[$i] > 256); next unless ($filled_idx[$i] > $level2);
my @table = @filled[($i<<4)..($i<<4)+15]; my @table = @filled[($i<<4)..($i<<4)+15];
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; } for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
printf OUTPUT ",\n /* sub-index %02x */\n", $i; printf OUTPUT ",\n /* sub-index %02x */\n", $i;
@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
$pos = $total; $pos = $total;
my @data; my @data;
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
my @table = (0) x (16); my @table = (0) x (16);