mirror of
git://source.winehq.org/git/wine.git
synced 2024-11-01 09:50:52 +00:00
unicode: Add support for high Unicode planes in decomposition tables.
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
parent
148f564d10
commit
c658731975
5 changed files with 4478 additions and 1695 deletions
|
@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
|
||||||
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
|
||||||
SetLastError( 0xdeadbeef );
|
SetLastError( 0xdeadbeef );
|
||||||
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
|
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
|
||||||
todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
|
||||||
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
|
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
|
||||||
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
|
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
|
||||||
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
|
||||||
SetLastError( 0xdeadbeef );
|
SetLastError( 0xdeadbeef );
|
||||||
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
|
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
|
||||||
todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
|
||||||
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
|
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
|
||||||
SetLastError( 0xdeadbeef );
|
SetLastError( 0xdeadbeef );
|
||||||
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
|
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
|
||||||
todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
|
todo_wine
|
||||||
|
ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
|
||||||
|
todo_wine_if (i == 0 || i == 2)
|
||||||
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
|
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
|
||||||
if (pRtlNormalizeString)
|
if (pRtlNormalizeString)
|
||||||
{
|
{
|
||||||
|
@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
|
||||||
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
|
||||||
dstlen = ARRAY_SIZE(dst);
|
dstlen = ARRAY_SIZE(dst);
|
||||||
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
||||||
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
|
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
|
||||||
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
|
||||||
dstlen = 1;
|
dstlen = 1;
|
||||||
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
||||||
|
todo_wine_if( i == 0 || i == 2)
|
||||||
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
|
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
|
||||||
todo_wine_if (i != 3)
|
todo_wine_if( i != 3)
|
||||||
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
|
||||||
dstlen = 2;
|
dstlen = 2;
|
||||||
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
|
||||||
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
|
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
|
||||||
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
|
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len )
|
static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
|
||||||
{
|
{
|
||||||
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
|
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
|
||||||
unsigned short start = table[offset];
|
unsigned short start = table[offset];
|
||||||
unsigned short end = table[offset + 1];
|
unsigned short end = table[offset + 1];
|
||||||
|
|
||||||
if ((*len = end - start)) return table + start;
|
if ((*len = end - start)) return table + start;
|
||||||
*len = 1;
|
*len = 1 + (ch >= 0x10000);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static BOOL is_starter( WCHAR c )
|
static BOOL is_starter( unsigned int c )
|
||||||
{
|
{
|
||||||
return !get_combining_class( c );
|
return !get_combining_class( c );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
|
static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
|
||||||
{
|
{
|
||||||
BYTE ccc1, ccc2;
|
BYTE ccc1, ccc2;
|
||||||
|
|
||||||
|
@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
|
||||||
return ccc2 && (ccc1 > ccc2);
|
return ccc2 && (ccc1 > ccc2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
|
||||||
|
{
|
||||||
|
if (IS_HIGH_SURROGATE( src[0] ))
|
||||||
|
{
|
||||||
|
if (srclen <= 1) return 0;
|
||||||
|
if (!IS_LOW_SURROGATE( src[1] )) return 0;
|
||||||
|
*ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
if (IS_LOW_SURROGATE( src[0] )) return 0;
|
||||||
|
*ch = src[0];
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void put_utf16( WCHAR *dst, unsigned int ch )
|
||||||
|
{
|
||||||
|
if (ch >= 0x10000)
|
||||||
|
{
|
||||||
|
ch -= 0x10000;
|
||||||
|
dst[0] = 0xd800 | (ch >> 10);
|
||||||
|
dst[1] = 0xdc00 | (ch & 0x3ff);
|
||||||
|
}
|
||||||
|
else dst[0] = ch;
|
||||||
|
}
|
||||||
|
|
||||||
static void canonical_order_substring( WCHAR *str, unsigned int len )
|
static void canonical_order_substring( WCHAR *str, unsigned int len )
|
||||||
{
|
{
|
||||||
unsigned int i;
|
unsigned int i, ch1, ch2, len1, len2;
|
||||||
BOOL swapped;
|
BOOL swapped;
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
swapped = FALSE;
|
swapped = FALSE;
|
||||||
for (i = 0; i < len - 1; i++)
|
for (i = 0; i < len - 1; i += len1)
|
||||||
{
|
{
|
||||||
if (reorderable_pair( str[i], str[i + 1] ))
|
if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
|
||||||
|
if (i + len1 >= len) break;
|
||||||
|
if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
|
||||||
|
if (reorderable_pair( ch1, ch2 ))
|
||||||
{
|
{
|
||||||
WCHAR tmp = str[i];
|
WCHAR tmp[2];
|
||||||
str[i] = str[i + 1];
|
memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
|
||||||
str[i + 1] = tmp;
|
memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
|
||||||
|
memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
|
||||||
swapped = TRUE;
|
swapped = TRUE;
|
||||||
|
i += len2 - len1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (swapped);
|
} while (swapped);
|
||||||
|
@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
|
||||||
*/
|
*/
|
||||||
static void canonical_order_string( WCHAR *str, unsigned int len )
|
static void canonical_order_string( WCHAR *str, unsigned int len )
|
||||||
{
|
{
|
||||||
unsigned int i, next = 0;
|
unsigned int ch, i, r, next = 0;
|
||||||
|
|
||||||
for (i = 1; i <= len; i++)
|
for (i = 0; i < len; i += r)
|
||||||
{
|
{
|
||||||
if (i == len || is_starter( str[i] ))
|
if (!(r = get_utf16( str + i, len - i, &ch ))) return;
|
||||||
|
if (i && is_starter( ch ))
|
||||||
{
|
{
|
||||||
if (i > next + 1) /* at least two successive non-starters */
|
if (i > next + 1) /* at least two successive non-starters */
|
||||||
canonical_order_substring( str + next, i - next );
|
canonical_order_substring( str + next, i - next );
|
||||||
next = i + 1;
|
next = i + r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (i > next + 1) canonical_order_substring( str + next, i - next );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
|
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
|
||||||
{
|
{
|
||||||
const unsigned short *table = compat ? nfkd_table : nfd_table;
|
const unsigned short *table = compat ? nfkd_table : nfd_table;
|
||||||
int src_pos, dst_pos = 0;
|
int src_pos, dst_pos;
|
||||||
unsigned int decomp_len;
|
unsigned int ch, len, decomp_len;
|
||||||
const WCHAR *decomp;
|
const WCHAR *decomp;
|
||||||
|
|
||||||
for (src_pos = 0; src_pos < src_len; src_pos++)
|
for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
|
||||||
{
|
{
|
||||||
if (dst_pos == *dst_len) break;
|
if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
|
||||||
if ((decomp = get_decomposition( table, src[src_pos], &decomp_len )))
|
(ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
|
||||||
{
|
{
|
||||||
if (dst_pos + decomp_len > *dst_len) break;
|
*dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
|
||||||
memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
|
return STATUS_NO_UNICODE_TRANSLATION;
|
||||||
}
|
}
|
||||||
else dst[dst_pos] = src[src_pos];
|
decomp = get_decomposition( table, ch, &decomp_len );
|
||||||
dst_pos += decomp_len;
|
if (dst_pos + decomp_len > *dst_len) break;
|
||||||
|
if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
|
||||||
|
else put_utf16( dst + dst_pos, ch );
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src_pos < src_len)
|
if (src_pos < src_len)
|
||||||
{
|
{
|
||||||
*dst_len += (src_len - src_pos) * (compat ? 18 : 3);
|
*dst_len += (src_len - src_pos) * (compat ? 18 : 3);
|
||||||
|
@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* get the next char value taking surrogates into account */
|
|
||||||
static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
|
|
||||||
{
|
|
||||||
if (src[0] >= 0xd800 && src[0] <= 0xdfff) /* surrogate pair */
|
|
||||||
{
|
|
||||||
if (src[0] > 0xdbff || /* invalid high surrogate */
|
|
||||||
srclen <= 1 || /* missing low surrogate */
|
|
||||||
src[1] < 0xdc00 || src[1] > 0xdfff) /* invalid low surrogate */
|
|
||||||
return 0;
|
|
||||||
return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
|
|
||||||
}
|
|
||||||
return src[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**************************************************************************
|
/**************************************************************************
|
||||||
* RtlUnicodeToUTF8N (NTDLL.@)
|
* RtlUnicodeToUTF8N (NTDLL.@)
|
||||||
*/
|
*/
|
||||||
|
@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
|
||||||
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
|
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (!(val = get_surrogate_value( src, srclen )))
|
if (!get_utf16( src, srclen, &val ))
|
||||||
{
|
{
|
||||||
val = 0xfffd;
|
val = 0xfffd;
|
||||||
status = STATUS_SOME_NOT_MAPPED;
|
status = STATUS_SOME_NOT_MAPPED;
|
||||||
|
@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
|
||||||
dst += 2;
|
dst += 2;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!(val = get_surrogate_value( src, srclen )))
|
if (!get_utf16( src, srclen, &val ))
|
||||||
{
|
{
|
||||||
val = 0xfffd;
|
val = 0xfffd;
|
||||||
status = STATUS_SOME_NOT_MAPPED;
|
status = STATUS_SOME_NOT_MAPPED;
|
||||||
|
|
|
@ -480,7 +480,7 @@ sub build_decompositions(@)
|
||||||
my @src = @_;
|
my @src = @_;
|
||||||
my @dst;
|
my @dst;
|
||||||
|
|
||||||
for (my $i = 0; $i < 65536; $i++)
|
for (my $i = 0; $i < @src; $i++)
|
||||||
{
|
{
|
||||||
next unless defined $src[$i];
|
next unless defined $src[$i];
|
||||||
my @decomp = get_decomposition( $i, \@src );
|
my @decomp = get_decomposition( $i, \@src );
|
||||||
|
@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
|
||||||
|
|
||||||
# first determine all the 16-char subsets that contain something
|
# first determine all the 16-char subsets that contain something
|
||||||
|
|
||||||
my @filled = (0) x 4096;
|
my $level1 = ($MAX_CHAR + 1) / 16;
|
||||||
|
my $level2 = $level1 / 16;
|
||||||
|
|
||||||
|
my @filled = (0) x $level1;
|
||||||
my $pos = 16; # for the null subset
|
my $pos = 16; # for the null subset
|
||||||
my $data_total = 0;
|
my $data_total = 0;
|
||||||
for (my $i = 0; $i < 65536; $i++)
|
for (my $i = 0; $i <= $MAX_CHAR; $i++)
|
||||||
{
|
{
|
||||||
next unless defined $decomp[$i];
|
next unless defined $decomp[$i];
|
||||||
if ($filled[$i >> 4] == 0)
|
if ($filled[$i >> 4] == 0)
|
||||||
|
@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
|
||||||
|
|
||||||
# now count the 256-char subsets that contain something
|
# now count the 256-char subsets that contain something
|
||||||
|
|
||||||
my @filled_idx = (256) x 256;
|
my @filled_idx = ($level2) x $level2;
|
||||||
$pos = 256 + 16;
|
$pos = $level2 + 16;
|
||||||
for (my $i = 0; $i < 4096; $i++)
|
for (my $i = 0; $i < $level1; $i++)
|
||||||
{
|
{
|
||||||
next unless $filled[$i];
|
next unless $filled[$i];
|
||||||
$filled_idx[$i >> 4] = $pos;
|
$filled_idx[$i >> 4] = $pos;
|
||||||
|
@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
|
||||||
|
|
||||||
# add the index offsets to the subsets positions
|
# add the index offsets to the subsets positions
|
||||||
|
|
||||||
for (my $i = 0; $i < 4096; $i++)
|
for (my $i = 0; $i < $level1; $i++)
|
||||||
{
|
{
|
||||||
next unless $filled[$i];
|
next unless $filled[$i];
|
||||||
$filled[$i] += $null_offset;
|
$filled[$i] += $null_offset;
|
||||||
|
@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
|
||||||
|
|
||||||
# dump the second-level indexes
|
# dump the second-level indexes
|
||||||
|
|
||||||
for (my $i = 0; $i < 256; $i++)
|
for (my $i = 0; $i < $level2; $i++)
|
||||||
{
|
{
|
||||||
next unless ($filled_idx[$i] > 256);
|
next unless ($filled_idx[$i] > $level2);
|
||||||
my @table = @filled[($i<<4)..($i<<4)+15];
|
my @table = @filled[($i<<4)..($i<<4)+15];
|
||||||
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
|
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
|
||||||
printf OUTPUT ",\n /* sub-index %02x */\n", $i;
|
printf OUTPUT ",\n /* sub-index %02x */\n", $i;
|
||||||
|
@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
|
||||||
$pos = $total;
|
$pos = $total;
|
||||||
|
|
||||||
my @data;
|
my @data;
|
||||||
for (my $i = 0; $i < 4096; $i++)
|
for (my $i = 0; $i < $level1; $i++)
|
||||||
{
|
{
|
||||||
next unless $filled[$i];
|
next unless $filled[$i];
|
||||||
my @table = (0) x (16);
|
my @table = (0) x (16);
|
||||||
|
|
Loading…
Reference in a new issue