wine/dlls/ntdll/locale_private.h

729 lines
22 KiB
C
Raw Normal View History

/*
* Ntdll locale definitions
*
* Copyright 2019, 2022 Alexandre Julliard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef __NTDLL_LOCALE_PRIVATE_H
#define __NTDLL_LOCALE_PRIVATE_H
#include "windef.h"
#include "winbase.h"
#include "winnls.h"
/* NLS codepage file format:
*
* header:
* WORD offset to cp2uni table in words
* WORD CodePage
* WORD MaximumCharacterSize
* BYTE[2] DefaultChar
* WORD UniDefaultChar
* WORD TransDefaultChar
* WORD TransUniDefaultChar
* BYTE[12] LeadByte
* cp2uni table:
* WORD offset to uni2cp table in words
* WORD[256] cp2uni table
* WORD glyph table size
* WORD[glyph_table_size] glyph table
* WORD number of lead byte ranges
* WORD[256] lead byte offsets in words
* WORD[leadbytes][256] cp2uni table for lead bytes
* uni2cp table:
* WORD 0 / 4
* BYTE[65536] / WORD[65536] uni2cp table
*/
/* Numeric identifiers for the kinds of NLS data sections named below.
 * NOTE(review): the consumers of these values are not visible in this header;
 * presumably they select which NLS section to load - confirm against callers. */
enum nls_section_type
{
NLS_SECTION_SORTKEYS = 9,
NLS_SECTION_CASEMAP = 10,
NLS_SECTION_CODEPAGE = 11,
NLS_SECTION_NORMALIZE = 12
};
/* NLS normalization file
 *
 * Header found at the start of a normalization table. The *_offset style
 * fields (classes, props_level*, decomp_*, comp_*) are used by the helpers in
 * this header as indices in USHORT units from the start of the structure,
 * i.e. as (const USHORT *)info + info->field. The hex number at the start of
 * each field comment is the byte offset of that field. The variable-size
 * tables listed at the end follow the header in the file.
 */
struct norm_table
{
WCHAR name[13]; /* 00 file name */
USHORT checksum[3]; /* 1a checksum? */
USHORT version[4]; /* 20 Unicode version */
USHORT form; /* 28 normalization form */
USHORT len_factor; /* 2a factor for length estimates */
USHORT unknown1; /* 2c */
USHORT decomp_size; /* 2e decomposition hash size */
USHORT comp_size; /* 30 composition hash size */
USHORT unknown2; /* 32 */
USHORT classes; /* 34 combining classes table offset */
USHORT props_level1; /* 36 char properties table level 1 offset */
USHORT props_level2; /* 38 char properties table level 2 offset */
USHORT decomp_hash; /* 3a decomposition hash table offset */
USHORT decomp_map; /* 3c decomposition character map table offset */
USHORT decomp_seq; /* 3e decomposition character sequences offset */
USHORT comp_hash; /* 40 composition hash table offset */
USHORT comp_seq; /* 42 composition character sequences offset */
/* BYTE[] combining class values */
/* BYTE[0x2200] char properties index level 1 */
/* BYTE[] char properties index level 2 */
/* WORD[] decomposition hash table */
/* WORD[] decomposition character map */
/* WORD[] decomposition character sequences */
/* WORD[] composition hash table */
/* WORD[] composition character sequences */
};
/* Read one code point from a UTF-16 buffer.
 *
 * src    - input; src[0] is always read, src[1] only when srclen > 1
 * srclen - number of WCHARs available at src
 * ch     - receives the decoded code point on success
 *
 * Returns the number of WCHARs consumed (1 or 2), or 0 when src[0] is an
 * unpaired surrogate; *ch is left untouched in that case.
 */
static inline int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
{
    const WCHAR first = src[0];

    if (!IS_HIGH_SURROGATE( first ))
    {
        /* a low surrogate that does not follow a high one is invalid */
        if (IS_LOW_SURROGATE( first )) return 0;
        *ch = first;
        return 1;
    }

    /* a high surrogate must be followed by a low one inside the buffer */
    if (srclen <= 1 || !IS_LOW_SURROGATE( src[1] )) return 0;

    *ch = 0x10000 + ((first & 0x3ff) << 10) + (src[1] & 0x3ff);
    return 2;
}
/* Store a code point as UTF-16.
 *
 * Writes one WCHAR for values below 0x10000 and a surrogate pair (two WCHARs)
 * otherwise; the caller must provide enough room at dst.
 */
static inline void put_utf16( WCHAR *dst, unsigned int ch )
{
    if (ch < 0x10000)
    {
        dst[0] = ch;
        return;
    }
    ch -= 0x10000;
    dst[0] = 0xd800 | (ch >> 10);    /* high surrogate: top 10 bits */
    dst[1] = 0xdc00 | (ch & 0x3ff);  /* low surrogate: bottom 10 bits */
}
/* Decode the continuation bytes of a multi-byte UTF-8 sequence.
 *
 * ch     - lead byte, already consumed by the caller; must be >= 0x80
 * str    - in/out pointer to the byte following the lead byte; advanced past
 *          every continuation byte that was accepted
 * strend - end of the input buffer
 *
 * Returns the code point, or ~0 for an invalid, overlong, surrogate,
 * out-of-range or truncated sequence. On truncation *str is moved to where
 * the sequence would have ended, which is past strend.
 */
static inline unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend )
{
    /* number of following bytes in sequence based on first byte value (for bytes above 0x7f) */
    static const char utf8_length[128] =
    {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
        0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
        3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
    };

    /* first byte mask depending on UTF-8 sequence length */
    static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };

    const unsigned int extra = utf8_length[ch - 0x80];
    unsigned int code = ch & utf8_mask[extra];
    const char *tail = *str + extra;

    if (tail > strend)  /* sequence runs past the end of the input */
    {
        *str = tail;
        return ~0;
    }

    if (extra == 3)  /* second byte of a 4-byte sequence */
    {
        ch = tail[-3] ^ 0x80;
        if (ch >= 0x40) return ~0;  /* not a continuation byte */
        code = (code << 6) | ch;
        ++*str;
        if (code < 0x10) return ~0;  /* overlong: would encode < 0x10000 */
    }

    if (extra >= 2)  /* second-to-last byte */
    {
        ch = tail[-2] ^ 0x80;
        if (ch >= 0x40) return ~0;
        code = (code << 6) | ch;
        if (code >= 0x110000 >> 6) return ~0;  /* beyond the Unicode range */
        ++*str;
        if (code < 0x20) return ~0;  /* overlong: would encode < 0x800 */
        if (code >= 0xd800 >> 6 && code <= 0xdfff >> 6) return ~0;  /* surrogate */
    }

    if (extra >= 1)  /* last byte */
    {
        ch = tail[-1] ^ 0x80;
        if (ch >= 0x40) return ~0;
        code = (code << 6) | ch;
        ++*str;
        if (code < 0x80) return ~0;  /* overlong: would encode ASCII */
        return code;
    }

    /* stray continuation byte or invalid lead byte */
    return ~0;
}
/* Fill a CPTABLEINFO from an in-memory NLS codepage file.
 *
 * ptr  - start of the file data, viewed as an array of 16-bit words
 * info - structure to initialize; its table pointers point into ptr's data
 *
 * The layout follows the "NLS codepage file format" comment in this header.
 */
static inline void init_codepage_table( USHORT *ptr, CPTABLEINFO *info )
{
    /* ptr[0] is the header size in words, so this is the cp2uni section */
    USHORT *cp2uni = ptr + ptr[0];
    USHORT *dbcs;

    info->CodePage             = ptr[1];
    info->MaximumCharacterSize = ptr[2];
    info->DefaultChar          = ptr[3];
    info->UniDefaultChar       = ptr[4];
    info->TransDefaultChar     = ptr[5];
    info->TransUniDefaultChar  = ptr[6];
    memcpy( info->LeadByte, ptr + 7, sizeof(info->LeadByte) );

    /* first word of the section is the offset to the uni2cp section, whose
     * own first word is skipped */
    info->WideCharTable  = cp2uni + cp2uni[0] + 1;
    info->MultiByteTable = cp2uni + 1;

    /* after the 256-entry cp2uni table comes the glyph table size word,
     * followed by a 256-entry glyph table when that word is non-zero */
    dbcs = cp2uni + 258;
    if (cp2uni[257]) dbcs += 256;

    info->DBCSRanges   = dbcs;
    info->DBCSCodePage = *dbcs ? 1 : 0;  /* non-zero count of lead byte ranges */
    info->DBCSOffsets  = *dbcs ? dbcs + 1 : NULL;
}
/* Count the WCHARs needed to convert len bytes of codepage text at str.
 *
 * For a single-byte codepage this is len. For a DBCS codepage a lead byte
 * and its trail byte count as one character; a lead byte in the last
 * position counts on its own.
 */
static inline unsigned int cp_mbstowcs_size( const CPTABLEINFO *info, const char *str, unsigned int len )
{
    unsigned int count = 0;

    if (!info->DBCSCodePage) return len;

    while (len)
    {
        if (len > 1 && info->DBCSOffsets[(unsigned char)*str])
        {
            /* swallow the trail byte together with its lead byte */
            str++;
            len--;
        }
        str++;
        len--;
        count++;
    }
    return count;
}
/* Count the bytes needed to convert len WCHARs at str to the codepage.
 *
 * For a single-byte codepage this is len. For a DBCS codepage every
 * character whose uni2cp entry has a non-zero high byte takes two bytes.
 */
static inline unsigned int cp_wcstombs_size( const CPTABLEINFO *info, const WCHAR *str, unsigned int len )
{
    const WCHAR *table;
    unsigned int i, total;

    if (!info->DBCSCodePage) return len;

    table = info->WideCharTable;
    total = len;  /* one byte per character, plus one for each double-byte one */
    for (i = 0; i < len; i++)
        if (table[str[i]] & 0xff00) total++;
    return total;
}
/* Compute the number of UTF-8 bytes needed to encode a UTF-16 string.
 *
 * src/srclen - input and its length in WCHARs
 * reslen     - receives the byte count
 *
 * Returns STATUS_SUCCESS, or STATUS_SOME_NOT_MAPPED when at least one
 * unpaired surrogate was counted as U+FFFD (3 bytes).
 */
static inline NTSTATUS utf8_wcstombs_size( const WCHAR *src, unsigned int srclen, unsigned int *reslen )
{
    unsigned int code, total = 0;
    NTSTATUS status = STATUS_SUCCESS;

    while (srclen)
    {
        if (*src < 0x80) total += 1;        /* 0x00-0x7f: 1 byte */
        else if (*src < 0x800) total += 2;  /* 0x80-0x7ff: 2 bytes */
        else
        {
            if (!get_utf16( src, srclen, &code ))
            {
                code = 0xfffd;
                status = STATUS_SOME_NOT_MAPPED;
            }
            if (code < 0x10000) total += 3;  /* 0x800-0xffff: 3 bytes */
            else
            {
                /* 0x10000-0x10ffff: 4 bytes, and the low surrogate is consumed too */
                total += 4;
                src++;
                srclen--;
            }
        }
        src++;
        srclen--;
    }
    *reslen = total;
    return status;
}
/* Compute the number of WCHARs needed to decode a UTF-8 string.
 *
 * src/srclen - input and its length in bytes
 * reslen     - receives the WCHAR count
 *
 * Every lead byte counts for one WCHAR, plus one more when the sequence
 * decodes to a code point above 0xffff (surrogate pair). Returns
 * STATUS_SUCCESS, or STATUS_SOME_NOT_MAPPED if any sequence was invalid.
 */
static inline NTSTATUS utf8_mbstowcs_size( const char *src, unsigned int srclen, unsigned int *reslen )
{
    const char *srcend = src + srclen;
    unsigned int code, count = 0;
    NTSTATUS status = STATUS_SUCCESS;

    while (src < srcend)
    {
        unsigned char lead = *src++;

        count++;
        if (lead < 0x80) continue;  /* plain ASCII */

        code = decode_utf8_char( lead, &src, srcend );
        if (code > 0x10ffff) status = STATUS_SOME_NOT_MAPPED;
        else if (code > 0xffff) count++;  /* needs a second surrogate */
    }
    *reslen = count;
    return status;
}
/* Convert codepage text to UTF-16 using the tables in info.
 *
 * dst/dstlen - output buffer and its capacity in WCHARs
 * src/srclen - input bytes and their count
 *
 * Stops when either the input or the output is exhausted and returns the
 * number of WCHARs written. A DBCS lead byte that is the last input byte is
 * converted through the single-byte table.
 */
static inline unsigned int cp_mbstowcs( const CPTABLEINFO *info, WCHAR *dst, unsigned int dstlen,
                                        const char *src, unsigned int srclen )
{
    const unsigned char *in = (const unsigned char *)src;
    unsigned int written;

    if (!info->DBCSOffsets)
    {
        /* single-byte codepage: plain table lookup */
        unsigned int count = min( srclen, dstlen );

        for (written = 0; written < count; written++)
            dst[written] = info->MultiByteTable[in[written]];
        return count;
    }

    for (written = 0; srclen && written < dstlen; written++, srclen--, in++)
    {
        USHORT off = info->DBCSOffsets[*in];

        if (off && srclen > 1)
        {
            /* lead byte: the trail byte indexes the lead byte's own table */
            in++;
            srclen--;
            dst[written] = info->DBCSOffsets[off + *in];
        }
        else dst[written] = info->MultiByteTable[*in];
    }
    return written;
}
/* Convert UTF-16 text to the codepage described by info.
 *
 * dst/dstlen - output buffer and its capacity in bytes
 * src/srclen - input and its length in WCHARs
 *
 * Stops when either the input or the output is exhausted and returns the
 * number of bytes written. On a DBCS codepage a double-byte character is
 * never split: conversion stops if only one byte of room is left for it.
 */
static inline unsigned int cp_wcstombs( const CPTABLEINFO *info, char *dst, unsigned int dstlen,
                                        const WCHAR *src, unsigned int srclen )
{
    if (info->DBCSCodePage)
    {
        const WCHAR *table = info->WideCharTable;
        unsigned int left = dstlen;

        while (srclen && left)
        {
            const WCHAR mb = table[*src];

            if (mb & 0xff00)
            {
                if (left == 1) break;  /* do not output a partial char */
                *dst++ = mb >> 8;
                left--;
            }
            *dst++ = (char)mb;
            left--;
            src++;
            srclen--;
        }
        return dstlen - left;
    }
    else
    {
        /* single-byte codepage: the table holds one byte per UTF-16 unit */
        const char *table = info->WideCharTable;
        const unsigned int count = min( srclen, dstlen );
        unsigned int i;

        for (i = 0; i < count; i++) dst[i] = table[src[i]];
        return count;
    }
}
/* Convert UTF-8 to UTF-16.
 *
 * dst/dstlen - output buffer and its capacity in WCHARs
 * reslen     - receives the number of WCHARs written
 * src/srclen - input and its length in bytes
 *
 * Invalid sequences are replaced with U+FFFD. Returns STATUS_SUCCESS,
 * STATUS_SOME_NOT_MAPPED if a replacement was made, or
 * STATUS_BUFFER_TOO_SMALL if input remains when the output is full (this
 * takes precedence over STATUS_SOME_NOT_MAPPED).
 */
static inline NTSTATUS utf8_mbstowcs( WCHAR *dst, unsigned int dstlen, unsigned int *reslen,
                                      const char *src, unsigned int srclen )
{
    const char *srcend = src + srclen;
    WCHAR *out = dst, *dstend = dst + dstlen;
    NTSTATUS status = STATUS_SUCCESS;

    while (out < dstend && src < srcend)
    {
        unsigned char lead = *src++;
        unsigned int code;

        if (lead < 0x80)  /* special fast case for 7-bit ASCII */
        {
            *out++ = lead;
            continue;
        }

        code = decode_utf8_char( lead, &src, srcend );
        if (code > 0x10ffff)  /* invalid sequence */
        {
            *out++ = 0xfffd;
            status = STATUS_SOME_NOT_MAPPED;
        }
        else if (code > 0xffff)  /* we need surrogates */
        {
            code -= 0x10000;
            *out++ = 0xd800 | (code >> 10);
            if (out == dstend) break;  /* no room left for the low surrogate */
            *out++ = 0xdc00 | (code & 0x3ff);
        }
        else *out++ = code;
    }
    if (src < srcend) status = STATUS_BUFFER_TOO_SMALL;  /* overflow */
    *reslen = out - dst;
    return status;
}
/* Convert UTF-16 to UTF-8.
 *
 * dst/dstlen - output buffer and its capacity in bytes
 * reslen     - receives the number of bytes written
 * src/srclen - input and its length in WCHARs
 *
 * Unpaired surrogates are encoded as U+FFFD. A character is never written
 * partially: conversion stops at the first one that does not fit. Returns
 * STATUS_SUCCESS, STATUS_SOME_NOT_MAPPED if a replacement was made, or
 * STATUS_BUFFER_TOO_SMALL if input remains unconverted (this takes
 * precedence over STATUS_SOME_NOT_MAPPED).
 */
static inline NTSTATUS utf8_wcstombs( char *dst, unsigned int dstlen, unsigned int *reslen,
                                      const WCHAR *src, unsigned int srclen )
{
    char *end = dst + dstlen;
    unsigned int val;
    NTSTATUS status = STATUS_SUCCESS;

    /* The room checks compare the remaining space (end - dst) with the size
     * needed, rather than computing end - N, which would point before the
     * start of a buffer shorter than N bytes. */
    for ( ; srclen; srclen--, src++)
    {
        WCHAR ch = *src;

        if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
        {
            if (end - dst < 1) break;
            *dst++ = ch;
            continue;
        }

        if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
        {
            if (end - dst < 2) break;
            dst[0] = 0xc0 | (ch >> 6);
            dst[1] = 0x80 | (ch & 0x3f);
            dst += 2;
            continue;
        }

        if (!get_utf16( src, srclen, &val ))
        {
            val = 0xfffd;
            status = STATUS_SOME_NOT_MAPPED;
        }

        if (val < 0x10000)  /* 0x800-0xffff: 3 bytes */
        {
            if (end - dst < 3) break;
            dst[0] = 0xe0 | (val >> 12);
            dst[1] = 0x80 | ((val >> 6) & 0x3f);
            dst[2] = 0x80 | (val & 0x3f);
            dst += 3;
        }
        else  /* 0x10000-0x10ffff: 4 bytes */
        {
            if (end - dst < 4) break;
            dst[0] = 0xf0 | (val >> 18);
            dst[1] = 0x80 | ((val >> 12) & 0x3f);
            dst[2] = 0x80 | ((val >> 6) & 0x3f);
            dst[3] = 0x80 | (val & 0x3f);
            dst += 4;
            /* the surrogate pair used two input WCHARs */
            src++;
            srclen--;
        }
    }
    if (srclen) status = STATUS_BUFFER_TOO_SMALL;
    *reslen = dstlen - (end - dst);
    return status;
}
/* Constants for the arithmetic Hangul syllable decomposition/composition
 * done in get_decomposition() and compose_hangul():
 * S = precomposed syllables, L = leading consonants, V = vowels,
 * T = trailing consonants (index 0 of T means "no trailing consonant",
 * which is why HANGUL_TBASE is one below the first trailing consonant used).
 * NCOUNT is the number of syllables per leading consonant, SCOUNT the total
 * number of precomposed syllables. */
#define HANGUL_SBASE 0xac00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11a7
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT)
/* Look up the decomposition of a code point.
 *
 * info    - normalization table
 * ch      - code point to decompose
 * props   - its properties byte, as returned by get_char_props()
 * buffer  - scratch space for at least 3 WCHARs
 * ret_len - receives the length of the result in WCHARs
 *
 * Returns buffer when the result was built locally (no decomposition, or a
 * Hangul syllable), a pointer into the table's sequence data when a mapping
 * was found, or NULL when props is 0xff and ch is outside the Hangul ranges.
 * A table mapping whose first WCHAR is 0 is reported with *ret_len == 0.
 */
static inline const WCHAR *get_decomposition( const struct norm_table *info, unsigned int ch,
                                              BYTE props, WCHAR *buffer, unsigned int *ret_len )
{
    const struct pair { WCHAR src; USHORT dst; } *pairs;
    const USHORT *base = (const USHORT *)info;
    const USHORT *hash_table = base + info->decomp_hash;
    const WCHAR *seq;
    unsigned int bucket, idx, stop, next, count;

    /* default to no decomposition */
    put_utf16( buffer, ch );
    *ret_len = (ch >= 0x10000) ? 2 : 1;
    if (!props || props == 0x7f) return buffer;

    if (props == 0xff)  /* Hangul or invalid char */
    {
        unsigned short sindex, tindex;

        if (ch < HANGUL_SBASE || ch >= HANGUL_SBASE + HANGUL_SCOUNT)
        {
            /* ignore other chars in Hangul range */
            if (ch >= HANGUL_LBASE && ch < HANGUL_LBASE + 0x100) return buffer;
            if (ch >= HANGUL_SBASE && ch < HANGUL_SBASE + 0x2c00) return buffer;
            return NULL;
        }

        /* precomposed syllable: split arithmetically into L, V and optional T */
        sindex = ch - HANGUL_SBASE;
        tindex = sindex % HANGUL_TCOUNT;
        buffer[0] = HANGUL_LBASE + sindex / HANGUL_NCOUNT;
        buffer[1] = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT;
        if (tindex)
        {
            buffer[2] = HANGUL_TBASE + tindex;
            *ret_len = 3;
        }
        else *ret_len = 2;
        return buffer;
    }

    bucket = ch % info->decomp_size;
    idx = hash_table[bucket];
    if (idx >> 13)
    {
        /* direct entry: length in the top 3 bits, sequence offset in the low 13 */
        if (props != 0xbf) return buffer;
        seq = base + info->decomp_seq + (idx & 0x1fff);
        count = idx >> 13;
    }
    else
    {
        /* indirect entry: idx is the first slot of this bucket in the pair map */
        pairs = (const struct pair *)(base + info->decomp_map);

        /* find the end of the hash bucket: it is the start of the next indirect
         * bucket, or the null terminator of the map when there is none */
        next = bucket + 1;
        while (next < info->decomp_size && (hash_table[next] >> 13)) next++;
        if (next < info->decomp_size) stop = hash_table[next];
        else for (stop = idx; pairs[stop].src; stop++) ;

        while (idx < stop && pairs[idx].src != (WCHAR)ch) idx++;
        if (idx >= stop) return buffer;  /* not in the map */

        seq = base + info->decomp_seq + (pairs[idx].dst & 0x1fff);
        count = pairs[idx].dst >> 13;
    }

    /* a stored length of 7 means "7 or more": the sequence is null-terminated */
    if (count == 7) while (seq[count]) count++;
    if (!seq[0]) count = 0;  /* ignored char */
    *ret_len = count;
    return seq;
}
/* Rotate an 8-bit value left by count bits (count is expected to be 0..8). */
static inline BYTE rol( BYTE val, BYTE count )
{
    const int high = val << count;
    const int low = val >> (8 - count);

    /* the conversion to BYTE drops the bits shifted above bit 7 */
    return high | low;
}
/* Fetch the properties byte of a code point from the two-level table.
 *
 * Level 1 has one entry per block of 128 code points. An entry of 0 or
 * >= 0xfb describes the whole block and is returned rotated left by 5 bits;
 * any other entry selects (1-based) a 128-byte page in level 2.
 */
static inline BYTE get_char_props( const struct norm_table *info, unsigned int ch )
{
    const USHORT *base = (const USHORT *)info;
    const BYTE *level1 = (const BYTE *)(base + info->props_level1);
    const BYTE *level2 = (const BYTE *)(base + info->props_level2);
    const BYTE page = level1[ch >> 7];

    if (page && page < 0xfb) return level2[(page - 1) * 128 + (ch & 0x7f)];
    return rol( page, 5 );
}
/* Return the canonical combining class of a code point.
 *
 * The low 6 bits of the properties byte index the combining class table;
 * the index value 0x3f is special and yields class 0.
 */
static inline BYTE get_combining_class( const struct norm_table *info, unsigned int c )
{
    const BYTE index = get_char_props( info, c ) & 0x3f;
    const BYTE *table;

    if (index == 0x3f) return 0;
    table = (const BYTE *)((const USHORT *)info + info->classes);
    return table[index];
}
/* Check whether two adjacent code points are in the wrong canonical order,
 * i.e. whether ccc(c1) > ccc(c2) > 0. */
static inline BOOL reorderable_pair( const struct norm_table *info, unsigned int c1, unsigned int c2 )
{
    const BYTE first = get_combining_class( info, c1 );
    BYTE second;

    /* ccc1 must exceed a non-zero ccc2, so it has to be at least 2 */
    if (first <= 1) return FALSE;

    second = get_combining_class( info, c2 );
    return second != 0 && second < first;
}
/* Sort a run of characters into canonical order, in place.
 *
 * info - normalization table providing the combining classes
 * str  - start of the run
 * len  - length of the run in WCHARs
 *
 * Repeatedly sweeps the run and swaps adjacent code points for which
 * reorderable_pair() is true, until a sweep makes no swap. A sweep stops
 * early at an invalid surrogate sequence.
 */
static inline void canonical_order_substring( const struct norm_table *info, WCHAR *str, unsigned int len )
{
    unsigned int i, ch1, ch2, len1, len2;
    BOOL swapped;

    /* fewer than two WCHARs cannot hold a pair; this also keeps the unsigned
     * len - 1 below from wrapping around when len is 0 */
    if (len < 2) return;

    do
    {
        swapped = FALSE;
        for (i = 0; i < len - 1; i += len1)
        {
            if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
            if (i + len1 >= len) break;
            if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
            if (reorderable_pair( info, ch1, ch2 ))
            {
                WCHAR tmp[2];

                memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
                /* source and destination overlap when a 1-WCHAR char is
                 * followed by a surrogate pair, so memcpy is not allowed here */
                memmove( str + i, str + i + len1, len2 * sizeof(WCHAR) );
                memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
                swapped = TRUE;
                /* together with the loop's i += len1 this steps over the
                 * character that was moved to the front */
                i += len2 - len1;
            }
        }
    } while (swapped);
}
/* reorder the string into canonical order - D108/D109
 *
 * The string is split at characters of combining class 0 (other than one at
 * index 0); each stretch between them that is at least two WCHARs long is
 * handed to canonical_order_substring(). Processing stops silently at an
 * invalid surrogate sequence.
 */
static inline void canonical_order_string( const struct norm_table *info, WCHAR *str, unsigned int len )
{
    unsigned int ch, used, pos = 0, run_start = 0;

    while (pos < len)
    {
        used = get_utf16( str + pos, len - pos, &ch );
        if (!used) return;

        if (pos && !get_combining_class( info, ch ))
        {
            /* class 0 char ends the current run; the next run starts after it */
            if (pos > run_start + 1) /* at least two successive non-starters */
                canonical_order_substring( info, str + run_start, pos - run_start );
            run_start = pos + used;
        }
        pos += used;
    }

    /* trailing run */
    if (pos > run_start + 1) canonical_order_substring( info, str + run_start, pos - run_start );
}
/* Decompose a UTF-16 string and put the result into canonical order.
 *
 * info        - normalization table
 * src/src_len - input and its length in WCHARs
 * dst         - output buffer
 * dst_len     - in: capacity of dst in WCHARs; out: see below
 *
 * Returns:
 *  STATUS_SUCCESS                - *dst_len is the number of WCHARs written
 *  STATUS_BUFFER_TOO_SMALL       - *dst_len is increased by an estimate based
 *                                  on the remaining input and info->len_factor
 *  STATUS_NO_UNICODE_TRANSLATION - *dst_len is the input position of the
 *                                  offending character (plus one if it is a
 *                                  high surrogate)
 */
static inline NTSTATUS decompose_string( const struct norm_table *info, const WCHAR *src, int src_len,
                                         WCHAR *dst, int *dst_len )
{
    WCHAR scratch[3];
    const WCHAR *seq;
    unsigned int code, used, seq_len;
    int in = 0, out = 0;

    while (in < src_len)
    {
        used = get_utf16( src + in, src_len - in, &code );
        if (!used)
        {
            *dst_len = in + IS_HIGH_SURROGATE( src[in] );
            return STATUS_NO_UNICODE_TRANSLATION;
        }

        seq = get_decomposition( info, code, get_char_props( info, code ), scratch, &seq_len );
        if (!seq)
        {
            /* allow final null */
            if (!code && in == src_len - 1 && out < *dst_len)
            {
                dst[out++] = 0;
                break;
            }
            *dst_len = in;
            return STATUS_NO_UNICODE_TRANSLATION;
        }

        if (out + seq_len > *dst_len)
        {
            *dst_len += (src_len - in) * info->len_factor;
            return STATUS_BUFFER_TOO_SMALL;
        }
        memcpy( dst + out, seq, seq_len * sizeof(WCHAR) );
        out += seq_len;
        in += used;
    }

    canonical_order_string( info, dst, out );
    *dst_len = out;
    return STATUS_SUCCESS;
}
/* Compose two Hangul code points arithmetically.
 *
 * Handles a leading consonant followed by a vowel (giving an LV syllable)
 * and an LV syllable followed by a trailing consonant (giving an LVT
 * syllable). Returns the composed syllable, or 0 if the pair does not
 * compose this way.
 */
static inline unsigned int compose_hangul( unsigned int ch1, unsigned int ch2 )
{
    int lindex, vindex, sindex, tindex;

    if (ch1 >= HANGUL_LBASE && ch1 < HANGUL_LBASE + HANGUL_LCOUNT)
    {
        /* L + V -> LV; the L and S ranges are disjoint, so nothing else can match */
        vindex = ch2 - HANGUL_VBASE;
        if (vindex < 0 || vindex >= HANGUL_VCOUNT) return 0;
        lindex = ch1 - HANGUL_LBASE;
        return HANGUL_SBASE + (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT;
    }

    if (ch1 < HANGUL_SBASE || ch1 >= HANGUL_SBASE + HANGUL_SCOUNT) return 0;

    /* LV + T -> LVT; a syllable that already has a trailing consonant is final */
    sindex = ch1 - HANGUL_SBASE;
    if (sindex % HANGUL_TCOUNT) return 0;

    tindex = ch2 - HANGUL_TBASE;
    if (tindex <= 0 || tindex >= HANGUL_TCOUNT) return 0;
    return ch1 + tindex;
}
/* Look up the composition of two code points in the composition table.
 *
 * The hash table gives, for each bucket, the start of its data in the
 * sequence table; the next bucket's start marks its end. Each record is
 * three UTF-16 encoded code points: first char, second char, composite.
 * Returns the composite, or 0 if the pair is not in the table.
 */
static inline unsigned int compose_chars( const struct norm_table *info, unsigned int ch1, unsigned int ch2 )
{
    const USHORT *base = (const USHORT *)info;
    const USHORT *buckets = base + info->comp_hash;
    const WCHAR *seq = base + info->comp_seq;
    const unsigned int slot = (ch1 + 95 * ch2) % info->comp_size;
    const unsigned int limit = buckets[slot + 1];
    unsigned int pos, used, triple[3];

    for (pos = buckets[slot]; pos < limit; )
    {
        used = get_utf16( seq + pos, limit - pos, &triple[0] );
        pos += used;
        used = get_utf16( seq + pos, limit - pos, &triple[1] );
        pos += used;
        used = get_utf16( seq + pos, limit - pos, &triple[2] );
        pos += used;

        if (triple[0] == ch1 && triple[1] == ch2) return triple[2];
    }
    return 0;
}
/* Compose a (decomposed) UTF-16 string in place.
 *
 * info   - normalization table
 * str    - string to compose; modified in place
 * srclen - length of str in WCHARs
 *
 * Returns the new length in WCHARs, or 0 if an invalid surrogate sequence
 * is encountered.
 */
static inline unsigned int compose_string( const struct norm_table *info, WCHAR *str, unsigned int srclen )
{
/* last_starter starts out equal to srclen, which the loop below treats as
 * "no starter seen yet"; start_ch is the code point at last_starter */
unsigned int i, ch, comp, len, start_ch = 0, last_starter = srclen;
BYTE class, prev_class = 0;
for (i = 0; i < srclen; i += len)
{
if (!(len = get_utf16( str + i, srclen - i, &ch ))) return 0;
class = get_combining_class( info, ch );
/* No composition happens when there is no starter yet, when the previous
 * uncomposed character has a non-zero class >= the current one, or when
 * neither the Hangul rule nor the table yields a composite. The two
 * assignments to comp are only evaluated if the earlier tests are false. */
if (last_starter == srclen || (prev_class && prev_class >= class) ||
(!(comp = compose_hangul( start_ch, ch )) &&
!(comp = compose_chars( info, start_ch, ch ))))
{
/* keep the character; a class 0 character becomes the new starter */
if (!class)
{
last_starter = i;
start_ch = ch;
}
prev_class = class;
}
else
{
/* replace the starter with the composite and drop the current character */
int comp_len = 1 + (comp >= 0x10000);
int start_len = 1 + (start_ch >= 0x10000);
/* the composite may need a different number of WCHARs than the starter:
 * shift the characters between the starter and the current one */
if (comp_len != start_len)
memmove( str + last_starter + comp_len, str + last_starter + start_len,
(i - (last_starter + start_len)) * sizeof(WCHAR) );
/* close the gap left by the consumed character */
memmove( str + i + comp_len - start_len, str + i + len, (srclen - i - len) * sizeof(WCHAR) );
srclen += comp_len - start_len - len;
start_ch = comp;
/* with the loop's i += len, scanning resumes right after the new starter */
i = last_starter;
len = comp_len;
prev_class = 0;
put_utf16( str + i, comp );
}
}
return srclen;
}
#endif /* __NTDLL_LOCALE_PRIVATE_H */