LibUnicode: Parse and generate case folding code point data

Case folding rules use a mapping style similar to that of special casing
rules, where one code point may map to zero or more case folding rules. These
will be used for case-insensitive string comparisons. To see how case
folding can differ from other casing rules, consider "ß" (U+00DF):

    >>> "ß".lower()
    'ß'

    >>> "ß".upper()
    'SS'

    >>> "ß".title()
    'Ss'

    >>> "ß".casefold()
    'ss'
This commit is contained in:
Timothy Flynn 2023-01-17 08:34:38 -05:00 committed by Linus Groh
parent 9226cf7272
commit 8f2589b3b0
7 changed files with 207 additions and 5 deletions

View file

@ -13,6 +13,9 @@ set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
set(SPECIAL_CASING_SOURCE "SpecialCasing.txt")
set(SPECIAL_CASING_PATH "${UCD_PATH}/${SPECIAL_CASING_SOURCE}")
set(CASE_FOLDING_SOURCE "CaseFolding.txt")
set(CASE_FOLDING_PATH "${UCD_PATH}/${CASE_FOLDING_SOURCE}")
set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt")
set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}")
@ -71,6 +74,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SPECIAL_CASING_SOURCE}" "${SPECIAL_CASING_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${CASE_FOLDING_SOURCE}" "${CASE_FOLDING_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}")
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}")
@ -105,7 +109,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
"${UCD_VERSION_FILE}"
"${UNICODE_DATA_HEADER}"
"${UNICODE_DATA_IMPLEMENTATION}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -o "${CASE_FOLDING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -k "${BLOCKS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}"
)
invoke_generator(
"EmojiData"

View file

@ -41,6 +41,13 @@ struct SpecialCasing {
DeprecatedString condition;
};
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
// One parsed CaseFolding.txt entry: the code point being folded, the status of the
// rule, and the code point(s) it folds to.
struct CaseFolding {
    u32 code_point { 0 };

    // Spelled-out status name ("Common", "Full", "Simple" or "Special"). It is kept as a
    // string because the generator emits it verbatim as a CaseFoldingStatus enumerator.
    StringView status { "Common"sv };

    // Replacement code points, in order. Deliberately empty by default: a braced `{ 0 }`
    // initializer would create a one-element mapping containing U+0000 rather than an
    // empty list.
    Vector<u32> mapping;
};
// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
struct CodePointDecomposition {
// `tag` is a string since it's used for codegen as an enum value.
@ -90,6 +97,7 @@ struct CodePointData {
Optional<u32> simple_lowercase_mapping;
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
Vector<u32> case_folding_indices;
};
struct BlockName {
@ -117,6 +125,12 @@ struct UnicodeData {
Vector<DeprecatedString> conditions;
Vector<DeprecatedString> locales;
Vector<CaseFolding> case_folding;
u32 code_points_with_case_folding { 0 };
u32 largest_case_folding_mapping_size { 0 };
u32 largest_case_folding_size { 0 };
Vector<StringView> statuses;
Vector<CodePointData> code_point_data;
HashMap<u32, size_t> code_point_abbreviations;
@ -276,6 +290,54 @@ static ErrorOr<void> parse_special_casing(Core::Stream::BufferedFile& file, Unic
return {};
}
// Reads CaseFolding.txt line by line and records every case folding rule in
// `unicode_data.case_folding`, sorted by code point and then by status name.
//
// While parsing, this also tracks:
//  - `largest_case_folding_mapping_size`: the longest mapping seen, in code points.
//  - `statuses`: the distinct status names encountered, in first-seen order.
//
// Returns an error if reading from `file` fails. Malformed data lines trip a VERIFY,
// since the input is a fixed, trusted data file.
static ErrorOr<void> parse_case_folding(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
{
    Array<u8, 1024> buffer;

    while (TRY(file.can_read_line())) {
        auto line = TRY(file.read_line(buffer));
        if (line.is_empty() || line.starts_with('#'))
            continue;

        // Data lines are expected to split into exactly four ';'-separated fields, of which
        // the first three are used: code point, status letter, mapping.
        auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(segments.size() == 4);

        // Check explicitly that the status field is present before indexing into it.
        auto status = segments[1].trim_whitespace();
        VERIFY(!status.is_empty());

        CaseFolding folding {};
        folding.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
        folding.mapping = parse_code_point_list(segments[2]);

        switch (status[0]) {
        case 'C':
            folding.status = "Common"sv;
            break;
        case 'F':
            folding.status = "Full"sv;
            break;
        case 'S':
            folding.status = "Simple"sv;
            break;
        case 'T':
            folding.status = "Special"sv;
            break;
        default:
            // An unrecognized status must not silently inherit the struct's default
            // ("Common"); that would generate incorrect folding data.
            VERIFY_NOT_REACHED();
        }

        unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size());

        if (!unicode_data.statuses.contains_slow(folding.status))
            unicode_data.statuses.append(folding.status);

        unicode_data.case_folding.append(move(folding));
    }

    // Keep all rules for one code point adjacent, in a deterministic order.
    quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) {
        if (lhs.code_point != rhs.code_point)
            return lhs.code_point < rhs.code_point;
        return lhs.status < rhs.status;
    });

    return {};
}
static ErrorOr<void> parse_prop_list(Core::Stream::BufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
{
Array<u8, 1024> buffer;
@ -667,6 +729,14 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
}
}
bool has_case_folding { false };
for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) {
data.case_folding_indices.append(i);
has_case_folding = true;
}
}
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
@ -675,8 +745,11 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
unicode_data.code_points_with_special_casing += has_special_casing;
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
previous_code_point = data.code_point;
unicode_data.code_points_with_case_folding += has_case_folding;
unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size());
previous_code_point = data.code_point;
unicode_data.code_point_data.append(move(data));
}
@ -688,8 +761,9 @@ static ErrorOr<void> generate_unicode_data_header(Core::Stream::BufferedFile& fi
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("special_casing_mapping_size", DeprecatedString::number(unicode_data.largest_special_casing_mapping_size));
generator.set("case_folding_mapping_size", DeprecatedString::number(unicode_data.largest_case_folding_mapping_size));
auto generate_enum = [&](StringView name, StringView default_, Vector<DeprecatedString> values, Vector<Alias> aliases = {}) {
auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
quick_sort(values);
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
@ -737,6 +811,7 @@ namespace Unicode {
generate_enum("Locale"sv, "None"sv, unicode_data.locales);
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
@ -763,6 +838,14 @@ struct SpecialCasing {
Condition condition { Condition::None };
};
struct CaseFolding {
u32 code_point { 0 };
CaseFoldingStatus status { CaseFoldingStatus::Common };
u32 mapping[@case_folding_mapping_size@];
u32 mapping_size { 0 };
};
struct CodePointDecompositionRaw {
u32 code_point { 0 };
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
@ -779,6 +862,7 @@ struct CodePointDecomposition {
Optional<Locale> locale_from_string(StringView locale);
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
Span<CaseFolding const* const> case_folding_mapping(u32 code_point);
}
)~~~");
@ -795,6 +879,8 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
generator.set("largest_special_casing_size", DeprecatedString::number(unicode_data.largest_special_casing_size));
generator.set("special_casing_size", DeprecatedString::number(unicode_data.special_casing.size()));
generator.set("largest_case_folding_size", DeprecatedString::number(unicode_data.largest_case_folding_size));
generator.set("case_folding_size", DeprecatedString::number(unicode_data.case_folding.size()));
generator.append(R"~~~(
#include <AK/Array.h>
@ -830,7 +916,7 @@ namespace Unicode {
};
generator.append(R"~~~(
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing { {)~~~");
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");
for (auto const& casing : unicode_data.special_casing) {
generator.set("code_point", DeprecatedString::formatted("{:#x}", casing.code_point));
@ -854,6 +940,21 @@ static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing {
generator.append(R"~~~(
} };
static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");
for (auto const& folding : unicode_data.case_folding) {
generator.set("code_point", DeprecatedString::formatted("{:#x}", folding.code_point));
generator.set("status", folding.status);
generator.append(R"~~~(
{ @code_point@, CaseFoldingStatus::@status@)~~~");
append_list_and_size(folding.mapping, "0x{:x}"sv);
generator.append(" },");
}
generator.append(R"~~~(
} };
struct CodePointMapping {
u32 code_point { 0 };
u32 mapping { 0 };
@ -865,6 +966,12 @@ struct SpecialCaseMapping {
u32 special_casing_size { 0 };
};
struct CaseFoldingMapping {
u32 code_point { 0 };
Array<CaseFolding const*, @largest_case_folding_size@> case_folding {};
u32 case_folding_size { 0 };
};
struct CodePointAbbreviation {
u32 code_point { 0 };
@string_index_type@ abbreviation { 0 };
@ -953,7 +1060,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
generator.set("size", DeprecatedString::number(mapping->decomposition_size));
generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
} else {
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
append_list_and_size(mapping, "&s_@name@[{}]"sv);
generator.append(" },");
}
@ -977,6 +1084,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
@ -1148,6 +1256,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
}
Span<CaseFolding const* const> case_folding_mapping(u32 code_point)
{
auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator<CaseFoldingMapping> {});
if (mapping == nullptr)
return {};
return mapping->case_folding.span().slice(0, mapping->case_folding_size);
}
Optional<StringView> code_point_abbreviation(u32 code_point)
{
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
@ -1373,6 +1490,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
StringView generated_implementation_path;
StringView unicode_data_path;
StringView special_casing_path;
StringView case_folding_path;
StringView derived_general_category_path;
StringView prop_list_path;
StringView derived_core_prop_path;
@ -1394,6 +1512,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
@ -1416,6 +1535,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::Stream::OpenMode::Read));
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::Stream::OpenMode::Read));
auto special_casing_file = TRY(open_file(special_casing_path, Core::Stream::OpenMode::Read));
auto case_folding_file = TRY(open_file(case_folding_path, Core::Stream::OpenMode::Read));
auto prop_list_file = TRY(open_file(prop_list_path, Core::Stream::OpenMode::Read));
auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::Stream::OpenMode::Read));
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::Stream::OpenMode::Read));
@ -1433,6 +1553,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
UnicodeData unicode_data {};
TRY(parse_special_casing(*special_casing_file, unicode_data));
TRY(parse_case_folding(*case_folding_file, unicode_data));
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));

View file

@ -97,6 +97,33 @@ TEST_CASE(to_unicode_titlecase)
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
}
TEST_CASE(to_unicode_casefold)
{
    // For every ASCII byte, full case folding must produce exactly one byte, and that byte
    // must be what tolower() returns for it.
    for (u8 code_point = 0; code_point < 0x80; ++code_point) {
        StringView const input { reinterpret_cast<char const*>(&code_point), 1 };
        auto const expected = tolower(code_point);

        auto const folded = MUST(Unicode::to_unicode_casefold_full(input));
        auto const folded_bytes = folded.bytes_as_string_view();

        EXPECT_EQ(folded_bytes.length(), 1u);
        EXPECT_EQ(folded_bytes[0], expected);
    }

    // The remaining cases fold a single code point into several code points.

    // LATIN SMALL LETTER SHARP S
    auto const sharp_s = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv));
    EXPECT_EQ(sharp_s, "\u0073\u0073"sv);

    // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
    auto const alpha_ypogegrammeni = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv));
    EXPECT_EQ(alpha_ypogegrammeni, "\u03B1\u03B9"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
    auto const alpha_perispomeni = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv));
    EXPECT_EQ(alpha_perispomeni, "\u03B1\u0342"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    auto const alpha_perispomeni_ypogegrammeni = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv));
    EXPECT_EQ(alpha_perispomeni_ypogegrammeni, "\u03B1\u0342\u03B9"sv);
}
TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
{
// LATIN SMALL LETTER SHARP S

View file

@ -61,6 +61,13 @@ ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView
return builder.to_string();
}
// Returns a new String holding the case-folded form of `string`, as produced by
// Detail::build_casefold_string. Any error from building the folded text or from
// materializing the String is propagated to the caller.
ErrorOr<String> to_unicode_casefold_full(StringView string)
{
    Utf8View const code_points { string };

    StringBuilder folded;
    TRY(Detail::build_casefold_string(code_points, folded));

    return folded.to_string();
}
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }

View file

@ -44,6 +44,7 @@ u32 to_unicode_titlecase(u32 code_point);
ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {});
ErrorOr<String> to_unicode_casefold_full(StringView);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);

View file

@ -195,6 +195,19 @@ static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View
return nullptr;
}
// Returns the first case folding rule for `code_point` whose status equals any of the
// statuses in `StatusFilter`, or nullptr when no rule matches. Rules are examined in the
// order case_folding_mapping() returns them.
template<CaseFoldingStatus... StatusFilter>
static CaseFolding const* find_matching_case_folding(u32 code_point)
{
    for (auto const* candidate : case_folding_mapping(code_point)) {
        // Fold over the filter pack: true as soon as one requested status matches.
        bool const has_requested_status = ((candidate->status == StatusFilter) || ...);

        if (has_requested_status)
            return candidate;
    }

    return nullptr;
}
#endif
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
@ -314,4 +327,32 @@ ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma
#endif
}
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
// Appends the case-folded form of `code_points` to `builder`. When Unicode data is
// compiled out, nothing is appended and an error is returned instead.
ErrorOr<void> build_casefold_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder)
{
#if ENABLE_UNICODE_DATA
    // toCasefold(X): Map each character C in X to Case_Folding(C).
    //
    // Case_Folding(C) is defined by the CaseFolding.txt mappings whose status field is
    // "C" (common) or "F" (full), so only those two statuses are requested here.
    for (auto const code_point : code_points) {
        auto const* folding = find_matching_case_folding<CaseFoldingStatus::Common, CaseFoldingStatus::Full>(code_point);

        if (folding == nullptr) {
            // No applicable rule: the code point folds to itself.
            TRY(builder.try_append_code_point(code_point));
        } else {
            for (size_t index = 0; index < folding->mapping_size; ++index)
                TRY(builder.try_append_code_point(folding->mapping[index]));
        }
    }

    return {};
#else
    return Error::from_string_literal("Unicode data has been disabled");
#endif
}
}

View file

@ -17,5 +17,6 @@ namespace Unicode::Detail {
ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder);
}