/* * Ntdll locale definitions * * Copyright 2019, 2022 Alexandre Julliard * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA */ #ifndef __NTDLL_LOCALE_PRIVATE_H #define __NTDLL_LOCALE_PRIVATE_H #include "windef.h" #include "winbase.h" #include "winnls.h" /* NLS codepage file format: * * header: * WORD offset to cp2uni table in words * WORD CodePage * WORD MaximumCharacterSize * BYTE[2] DefaultChar * WORD UniDefaultChar * WORD TransDefaultChar * WORD TransUniDefaultChar * BYTE[12] LeadByte * cp2uni table: * WORD offset to uni2cp table in words * WORD[256] cp2uni table * WORD glyph table size * WORD[glyph_table_size] glyph table * WORD number of lead byte ranges * WORD[256] lead byte offsets in words * WORD[leadbytes][256] cp2uni table for lead bytes * uni2cp table: * WORD 0 / 4 * BYTE[65536] / WORD[65536] uni2cp table */ enum nls_section_type { NLS_SECTION_SORTKEYS = 9, NLS_SECTION_CASEMAP = 10, NLS_SECTION_CODEPAGE = 11, NLS_SECTION_NORMALIZE = 12 }; /* NLS normalization file */ struct norm_table { WCHAR name[13]; /* 00 file name */ USHORT checksum[3]; /* 1a checksum? */ USHORT version[4]; /* 20 Unicode version */ USHORT form; /* 28 normalization form */ USHORT len_factor; /* 2a factor for length estimates */ USHORT unknown1; /* 2c */ USHORT decomp_size; /* 2e decomposition hash size */ USHORT comp_size; /* 30 composition hash size */ USHORT unknown2; /* 32 */ USHORT classes; /* 34 combining classes table offset */ USHORT props_level1; /* 36 char properties table level 1 offset */ USHORT props_level2; /* 38 char properties table level 2 offset */ USHORT decomp_hash; /* 3a decomposition hash table offset */ USHORT decomp_map; /* 3c decomposition character map table offset */ USHORT decomp_seq; /* 3e decomposition character sequences offset */ USHORT comp_hash; /* 40 composition hash table offset */ USHORT comp_seq; /* 42 composition character sequences offset */ /* BYTE[] combining class values */ /* BYTE[0x2200] char properties index level 1 */ /* BYTE[] char properties index level 2 */ /* WORD[] decomposition hash table */ /* WORD[] decomposition character map */ /* WORD[] decomposition character sequences */ /* WORD[] composition hash table */ /* WORD[] composition character sequences */ }; /* locale.nls file */ struct locale_nls_header { UINT ctypes; UINT unknown1; UINT unknown2; UINT unknown3; UINT locales; UINT charmaps; UINT geoids; UINT scripts; }; static inline WCHAR casemap_ascii( WCHAR ch ) { if (ch >= 'a' && ch <= 'z') ch -= 'a' - 'A'; return ch; } static inline int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch ) { if (IS_HIGH_SURROGATE( src[0] )) { if (srclen <= 1) return 0; if (!IS_LOW_SURROGATE( src[1] )) return 0; *ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff); return 2; } if (IS_LOW_SURROGATE( src[0] )) return 0; *ch = src[0]; return 1; } static inline void put_utf16( WCHAR *dst, unsigned int ch ) { if (ch >= 0x10000) { ch -= 0x10000; dst[0] = 0xd800 | (ch >> 10); dst[1] = 0xdc00 | (ch & 0x3ff); } else dst[0] = ch; } static inline unsigned int decode_utf8_char( unsigned char ch, const char **str, const char *strend ) { /* number of following bytes in sequence based on first byte value (for bytes above 0x7f) */ static const char utf8_length[128] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */ 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */ 3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0 /* 0xf0-0xff */ }; /* first byte mask depending on UTF-8 sequence length */ static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 }; unsigned int len = utf8_length[ch - 0x80]; unsigned int res = ch & utf8_mask[len]; const char *end = *str + len; if (end > strend) { *str = end; return ~0; } switch (len) { case 3: if ((ch = end[-3] ^ 0x80) >= 0x40) break; res = (res << 6) | ch; (*str)++; if (res < 0x10) break; case 2: if ((ch = end[-2] ^ 0x80) >= 0x40) break; res = (res << 6) | ch; if (res >= 0x110000 >> 6) break; (*str)++; if (res < 0x20) break; if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break; case 1: if ((ch = end[-1] ^ 0x80) >= 0x40) break; res = (res << 6) | ch; (*str)++; if (res < 0x80) break; return res; } return ~0; } static inline void init_codepage_table( USHORT *ptr, CPTABLEINFO *info ) { USHORT hdr_size = ptr[0]; info->CodePage = ptr[1]; info->MaximumCharacterSize = ptr[2]; info->DefaultChar = ptr[3]; info->UniDefaultChar = ptr[4]; info->TransDefaultChar = ptr[5]; info->TransUniDefaultChar = ptr[6]; memcpy( info->LeadByte, ptr + 7, sizeof(info->LeadByte) ); ptr += hdr_size; info->WideCharTable = ptr + ptr[0] + 1; info->MultiByteTable = ++ptr; ptr += 256; if (*ptr++) ptr += 256; /* glyph table */ info->DBCSRanges = ptr; if (*ptr) /* dbcs ranges */ { info->DBCSCodePage = 1; info->DBCSOffsets = ptr + 1; } else { info->DBCSCodePage = 0; info->DBCSOffsets = NULL; } } static inline int compare_locale_names( const WCHAR *n1, const WCHAR *n2 ) { for (;;) { WCHAR ch1 = casemap_ascii( *n1++ ); WCHAR ch2 = casemap_ascii( *n2++ ); if (ch1 == '_') ch1 = '-'; if (ch2 == '_') ch2 = '-'; if (!ch1 || ch1 != ch2) return ch1 - ch2; } } static inline const NLS_LOCALE_LCNAME_INDEX *find_lcname_entry( const NLS_LOCALE_HEADER *header, const WCHAR *name ) { const WCHAR *strings = (const WCHAR *)((char *)header + header->strings_offset); const NLS_LOCALE_LCNAME_INDEX *index = (const NLS_LOCALE_LCNAME_INDEX *)((char *)header + header->lcnames_offset); int min = 0, max = header->nb_lcnames - 1; if (!name) return NULL; while (min <= max) { int res, pos = (min + max) / 2; const WCHAR *str = strings + index[pos].name; res = compare_locale_names( name, str + 1 ); if (res < 0) max = pos - 1; else if (res > 0) min = pos + 1; else return &index[pos]; } return NULL; } static inline const NLS_LOCALE_LCID_INDEX *find_lcid_entry( const NLS_LOCALE_HEADER *header, LCID lcid ) { const NLS_LOCALE_LCID_INDEX *index = (const NLS_LOCALE_LCID_INDEX *)((char *)header + header->lcids_offset); int min = 0, max = header->nb_lcids - 1; while (min <= max) { int pos = (min + max) / 2; if (lcid < index[pos].id) max = pos - 1; else if (lcid > index[pos].id) min = pos + 1; else return &index[pos]; } return NULL; } static inline const NLS_LOCALE_DATA *get_locale_data( const NLS_LOCALE_HEADER *header, UINT idx ) { ULONG offset = header->locales_offset + idx * header->locale_size; return (const NLS_LOCALE_DATA *)((const char *)header + offset); } static inline unsigned int cp_mbstowcs_size( const CPTABLEINFO *info, const char *str, unsigned int len ) { unsigned int res; if (!info->DBCSCodePage) return len; for (res = 0; len; len--, str++, res++) { if (info->DBCSOffsets[(unsigned char)*str] && len > 1) { str++; len--; } } return res; } static inline unsigned int cp_wcstombs_size( const CPTABLEINFO *info, const WCHAR *str, unsigned int len ) { if (info->DBCSCodePage) { WCHAR *uni2cp = info->WideCharTable; unsigned int res; for (res = 0; len; len--, str++, res++) if (uni2cp[*str] & 0xff00) res++; return res; } else return len; } static inline NTSTATUS utf8_wcstombs_size( const WCHAR *src, unsigned int srclen, unsigned int *reslen ) { unsigned int val, len; NTSTATUS status = STATUS_SUCCESS; for (len = 0; srclen; srclen--, src++) { if (*src < 0x80) len++; /* 0x00-0x7f: 1 byte */ else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */ else { if (!get_utf16( src, srclen, &val )) { val = 0xfffd; status = STATUS_SOME_NOT_MAPPED; } if (val < 0x10000) len += 3; /* 0x800-0xffff: 3 bytes */ else /* 0x10000-0x10ffff: 4 bytes */ { len += 4; src++; srclen--; } } } *reslen = len; return status; } static inline NTSTATUS utf8_mbstowcs_size( const char *src, unsigned int srclen, unsigned int *reslen ) { unsigned int res, len; NTSTATUS status = STATUS_SUCCESS; const char *srcend = src + srclen; for (len = 0; src < srcend; len++) { unsigned char ch = *src++; if (ch < 0x80) continue; if ((res = decode_utf8_char( ch, &src, srcend )) > 0x10ffff) status = STATUS_SOME_NOT_MAPPED; else if (res > 0xffff) len++; } *reslen = len; return status; } static inline unsigned int cp_mbstowcs( const CPTABLEINFO *info, WCHAR *dst, unsigned int dstlen, const char *src, unsigned int srclen ) { unsigned int i, ret; if (info->DBCSOffsets) { for (i = dstlen; srclen && i; i--, srclen--, src++, dst++) { USHORT off = info->DBCSOffsets[(unsigned char)*src]; if (off && srclen > 1) { src++; srclen--; *dst = info->DBCSOffsets[off + (unsigned char)*src]; } else *dst = info->MultiByteTable[(unsigned char)*src]; } ret = dstlen - i; } else { ret = min( srclen, dstlen ); for (i = 0; i < ret; i++) dst[i] = info->MultiByteTable[(unsigned char)src[i]]; } return ret; } static inline unsigned int cp_wcstombs( const CPTABLEINFO *info, char *dst, unsigned int dstlen, const WCHAR *src, unsigned int srclen ) { unsigned int i, ret; if (info->DBCSCodePage) { const WCHAR *uni2cp = info->WideCharTable; for (i = dstlen; srclen && i; i--, srclen--, src++) { if (uni2cp[*src] & 0xff00) { if (i == 1) break; /* do not output a partial char */ i--; *dst++ = uni2cp[*src] >> 8; } *dst++ = (char)uni2cp[*src]; } ret = dstlen - i; } else { const char *uni2cp = info->WideCharTable; ret = min( srclen, dstlen ); for (i = 0; i < ret; i++) dst[i] = uni2cp[src[i]]; } return ret; } static inline NTSTATUS utf8_mbstowcs( WCHAR *dst, unsigned int dstlen, unsigned int *reslen, const char *src, unsigned int srclen ) { unsigned int res; NTSTATUS status = STATUS_SUCCESS; const char *srcend = src + srclen; WCHAR *dstend = dst + dstlen; while ((dst < dstend) && (src < srcend)) { unsigned char ch = *src++; if (ch < 0x80) /* special fast case for 7-bit ASCII */ { *dst++ = ch; continue; } if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff) { *dst++ = res; } else if (res <= 0x10ffff) /* we need surrogates */ { res -= 0x10000; *dst++ = 0xd800 | (res >> 10); if (dst == dstend) break; *dst++ = 0xdc00 | (res & 0x3ff); } else { *dst++ = 0xfffd; status = STATUS_SOME_NOT_MAPPED; } } if (src < srcend) status = STATUS_BUFFER_TOO_SMALL; /* overflow */ *reslen = dstlen - (dstend - dst); return status; } static inline NTSTATUS utf8_wcstombs( char *dst, unsigned int dstlen, unsigned int *reslen, const WCHAR *src, unsigned int srclen ) { char *end; unsigned int val; NTSTATUS status = STATUS_SUCCESS; for (end = dst + dstlen; srclen; srclen--, src++) { WCHAR ch = *src; if (ch < 0x80) /* 0x00-0x7f: 1 byte */ { if (dst > end - 1) break; *dst++ = ch; continue; } if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */ { if (dst > end - 2) break; dst[1] = 0x80 | (ch & 0x3f); ch >>= 6; dst[0] = 0xc0 | ch; dst += 2; continue; } if (!get_utf16( src, srclen, &val )) { val = 0xfffd; status = STATUS_SOME_NOT_MAPPED; } if (val < 0x10000) /* 0x800-0xffff: 3 bytes */ { if (dst > end - 3) break; dst[2] = 0x80 | (val & 0x3f); val >>= 6; dst[1] = 0x80 | (val & 0x3f); val >>= 6; dst[0] = 0xe0 | val; dst += 3; } else /* 0x10000-0x10ffff: 4 bytes */ { if (dst > end - 4) break; dst[3] = 0x80 | (val & 0x3f); val >>= 6; dst[2] = 0x80 | (val & 0x3f); val >>= 6; dst[1] = 0x80 | (val & 0x3f); val >>= 6; dst[0] = 0xf0 | val; dst += 4; src++; srclen--; } } if (srclen) status = STATUS_BUFFER_TOO_SMALL; *reslen = dstlen - (end - dst); return status; } #define HANGUL_SBASE 0xac00 #define HANGUL_LBASE 0x1100 #define HANGUL_VBASE 0x1161 #define HANGUL_TBASE 0x11a7 #define HANGUL_LCOUNT 19 #define HANGUL_VCOUNT 21 #define HANGUL_TCOUNT 28 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) #define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) static inline const WCHAR *get_decomposition( const struct norm_table *info, unsigned int ch, BYTE props, WCHAR *buffer, unsigned int *ret_len ) { const struct pair { WCHAR src; USHORT dst; } *pairs; const USHORT *hash_table = (const USHORT *)info + info->decomp_hash; const WCHAR *ret; unsigned int i, pos, end, len, hash; /* default to no decomposition */ put_utf16( buffer, ch ); *ret_len = 1 + (ch >= 0x10000); if (!props || props == 0x7f) return buffer; if (props == 0xff) /* Hangul or invalid char */ { if (ch >= HANGUL_SBASE && ch < HANGUL_SBASE + HANGUL_SCOUNT) { unsigned short sindex = ch - HANGUL_SBASE; unsigned short tindex = sindex % HANGUL_TCOUNT; buffer[0] = HANGUL_LBASE + sindex / HANGUL_NCOUNT; buffer[1] = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT; if (tindex) buffer[2] = HANGUL_TBASE + tindex; *ret_len = 2 + !!tindex; return buffer; } /* ignore other chars in Hangul range */ if (ch >= HANGUL_LBASE && ch < HANGUL_LBASE + 0x100) return buffer; if (ch >= HANGUL_SBASE && ch < HANGUL_SBASE + 0x2c00) return buffer; return NULL; } hash = ch % info->decomp_size; pos = hash_table[hash]; if (pos >> 13) { if (props != 0xbf) return buffer; ret = (const USHORT *)info + info->decomp_seq + (pos & 0x1fff); len = pos >> 13; } else { pairs = (const struct pair *)((const USHORT *)info + info->decomp_map); /* find the end of the hash bucket */ for (i = hash + 1; i < info->decomp_size; i++) if (!(hash_table[i] >> 13)) break; if (i < info->decomp_size) end = hash_table[i]; else for (end = pos; pairs[end].src; end++) ; for ( ; pos < end; pos++) { if (pairs[pos].src != (WCHAR)ch) continue; ret = (const USHORT *)info + info->decomp_seq + (pairs[pos].dst & 0x1fff); len = pairs[pos].dst >> 13; break; } if (pos >= end) return buffer; } if (len == 7) while (ret[len]) len++; if (!ret[0]) len = 0; /* ignored char */ *ret_len = len; return ret; } static inline BYTE rol( BYTE val, BYTE count ) { return (val << count) | (val >> (8 - count)); } static inline BYTE get_char_props( const struct norm_table *info, unsigned int ch ) { const BYTE *level1 = (const BYTE *)((const USHORT *)info + info->props_level1); const BYTE *level2 = (const BYTE *)((const USHORT *)info + info->props_level2); BYTE off = level1[ch / 128]; if (!off || off >= 0xfb) return rol( off, 5 ); return level2[(off - 1) * 128 + ch % 128]; } static inline BYTE get_combining_class( const struct norm_table *info, unsigned int c ) { const BYTE *classes = (const BYTE *)((const USHORT *)info + info->classes); BYTE class = get_char_props( info, c ) & 0x3f; if (class == 0x3f) return 0; return classes[class]; } static inline BOOL reorderable_pair( const struct norm_table *info, unsigned int c1, unsigned int c2 ) { BYTE ccc1, ccc2; /* reorderable if ccc1 > ccc2 > 0 */ ccc1 = get_combining_class( info, c1 ); if (ccc1 < 2) return FALSE; ccc2 = get_combining_class( info, c2 ); return ccc2 && (ccc1 > ccc2); } static inline void canonical_order_substring( const struct norm_table *info, WCHAR *str, unsigned int len ) { unsigned int i, ch1, ch2, len1, len2; BOOL swapped; do { swapped = FALSE; for (i = 0; i < len - 1; i += len1) { if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break; if (i + len1 >= len) break; if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break; if (reorderable_pair( info, ch1, ch2 )) { WCHAR tmp[2]; memcpy( tmp, str + i, len1 * sizeof(WCHAR) ); memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) ); memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) ); swapped = TRUE; i += len2 - len1; } } } while (swapped); } /* reorder the string into canonical order - D108/D109 */ static inline void canonical_order_string( const struct norm_table *info, WCHAR *str, unsigned int len ) { unsigned int ch, i, r, next = 0; for (i = 0; i < len; i += r) { if (!(r = get_utf16( str + i, len - i, &ch ))) return; if (i && !get_combining_class( info, ch )) { if (i > next + 1) /* at least two successive non-starters */ canonical_order_substring( info, str + next, i - next ); next = i + r; } } if (i > next + 1) canonical_order_substring( info, str + next, i - next ); } static inline NTSTATUS decompose_string( const struct norm_table *info, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len ) { BYTE props; int src_pos, dst_pos; unsigned int ch, len, decomp_len; WCHAR buffer[3]; const WCHAR *decomp; for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len) { if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch ))) { *dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] ); return STATUS_NO_UNICODE_TRANSLATION; } props = get_char_props( info, ch ); if (!(decomp = get_decomposition( info, ch, props, buffer, &decomp_len ))) { /* allow final null */ if (!ch && src_pos == src_len - 1 && dst_pos < *dst_len) { dst[dst_pos++] = 0; break; } *dst_len = src_pos; return STATUS_NO_UNICODE_TRANSLATION; } if (dst_pos + decomp_len > *dst_len) { *dst_len += (src_len - src_pos) * info->len_factor; return STATUS_BUFFER_TOO_SMALL; } memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) ); dst_pos += decomp_len; } canonical_order_string( info, dst, dst_pos ); *dst_len = dst_pos; return STATUS_SUCCESS; } static inline unsigned int compose_hangul( unsigned int ch1, unsigned int ch2 ) { if (ch1 >= HANGUL_LBASE && ch1 < HANGUL_LBASE + HANGUL_LCOUNT) { int lindex = ch1 - HANGUL_LBASE; int vindex = ch2 - HANGUL_VBASE; if (vindex >= 0 && vindex < HANGUL_VCOUNT) return HANGUL_SBASE + (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT; } if (ch1 >= HANGUL_SBASE && ch1 < HANGUL_SBASE + HANGUL_SCOUNT) { int sindex = ch1 - HANGUL_SBASE; if (!(sindex % HANGUL_TCOUNT)) { int tindex = ch2 - HANGUL_TBASE; if (tindex > 0 && tindex < HANGUL_TCOUNT) return ch1 + tindex; } } return 0; } static inline unsigned int compose_chars( const struct norm_table *info, unsigned int ch1, unsigned int ch2 ) { const USHORT *table = (const USHORT *)info + info->comp_hash; const WCHAR *chars = (const USHORT *)info + info->comp_seq; unsigned int hash, start, end, i, len, ch[3]; hash = (ch1 + 95 * ch2) % info->comp_size; start = table[hash]; end = table[hash + 1]; while (start < end) { for (i = 0; i < 3; i++, start += len) len = get_utf16( chars + start, end - start, ch + i ); if (ch[0] == ch1 && ch[1] == ch2) return ch[2]; } return 0; } static inline unsigned int compose_string( const struct norm_table *info, WCHAR *str, unsigned int srclen ) { unsigned int i, ch, comp, len, start_ch = 0, last_starter = srclen; BYTE class, prev_class = 0; for (i = 0; i < srclen; i += len) { if (!(len = get_utf16( str + i, srclen - i, &ch ))) return 0; class = get_combining_class( info, ch ); if (last_starter == srclen || (prev_class && prev_class >= class) || (!(comp = compose_hangul( start_ch, ch )) && !(comp = compose_chars( info, start_ch, ch )))) { if (!class) { last_starter = i; start_ch = ch; } prev_class = class; } else { int comp_len = 1 + (comp >= 0x10000); int start_len = 1 + (start_ch >= 0x10000); if (comp_len != start_len) memmove( str + last_starter + comp_len, str + last_starter + start_len, (i - (last_starter + start_len)) * sizeof(WCHAR) ); memmove( str + i + comp_len - start_len, str + i + len, (srclen - i - len) * sizeof(WCHAR) ); srclen += comp_len - start_len - len; start_ch = comp; i = last_starter; len = comp_len; prev_class = 0; put_utf16( str + i, comp ); } } return srclen; } #endif /* __NTDLL_LOCALE_PRIVATE_H */