LibUnicode: Canonicalize calendar subtags

Calendar subtags are a bit of an odd one out in that we must match the
variants "ethiopic-amete-alem" exactly and in that order, with no other
variant attached to the "ca" keyword. So a separate method is needed for
this, and we now defer sorting the variant list until after other
canonicalization is done.
This commit is contained in:
Timothy Flynn 2021-08-31 12:19:19 -04:00 committed by Linus Groh
parent 8458f477a4
commit da89cf9afb
2 changed files with 35 additions and 6 deletions

View file

@ -320,6 +320,14 @@ TEST_CASE(canonicalize_unicode_locale_id)
test("EN-U-TZ-HONGKONG"sv, "en-u-tz-hkhkg"sv);
test("en-u-ta-hongkong"sv, "en-u-ta-hongkong"sv);
test("EN-U-TA-HONGKONG"sv, "en-u-ta-hongkong"sv);
test("en-u-ca-ethiopic-amete-alem"sv, "en-u-ca-ethioaa"sv);
test("EN-U-CA-ETHIOPIC-AMETE-ALEM"sv, "en-u-ca-ethioaa"sv);
test("en-u-ca-alem-ethiopic-amete"sv, "en-u-ca-alem-ethiopic-amete"sv);
test("EN-U-CA-ALEM-ETHIOPIC-AMETE"sv, "en-u-ca-alem-ethiopic-amete"sv);
test("en-u-ca-ethiopic-amete-xxx-alem"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
test("EN-U-CA-ETHIOPIC-AMETE-XXX-ALEM"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
test("en-u-cb-ethiopic-amete-alem"sv, "en-u-cb-ethiopic-amete-alem"sv);
test("EN-U-CB-ETHIOPIC-AMETE-ALEM"sv, "en-u-cb-ethiopic-amete-alem"sv);
test("en-t-en"sv, "en-t-en"sv);
test("EN-T-EN"sv, "en-t-en"sv);
@ -345,6 +353,8 @@ TEST_CASE(canonicalize_unicode_locale_id)
test("EN-T-K1-IMPERIAL"sv, "en-t-k1-imperial"sv);
test("en-t-k1-hongkong"sv, "en-t-k1-hongkong"sv);
test("EN-T-K1-HONGKONG"sv, "en-t-k1-hongkong"sv);
test("en-t-k1-ethiopic-amete-alem"sv, "en-t-k1-ethiopic-amete-alem"sv);
test("EN-T-K1-ETHIOPIC-AMETE-ALEM"sv, "en-t-k1-ethiopic-amete-alem"sv);
test("en-0-aaa"sv, "en-0-aaa"sv);
test("EN-0-AAA"sv, "en-0-aaa"sv);

View file

@ -483,6 +483,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
static void perform_hard_coded_key_value_substitutions(String& key, String& value)
{
// FIXME: In the XML export of CLDR, there are some aliases defined in the following files:
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/collation.xml
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/measure.xml
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/timezone.xml
@ -490,7 +491,9 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
//
// There doesn't seem to be a counterpart in the JSON export. Since there aren't many such
// aliases, until an XML parser is implemented, those aliases are implemented here.
if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
if ((key == "ca"sv) && (value == "islamicc"sv)) {
value = "islamic-civil"sv;
} else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
value = "true"sv;
} else if (key == "ks"sv) {
if (value == "primary"sv)
@ -540,6 +543,20 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
}
}
static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector<String>& values)
{
    // Companion to perform_hard_coded_key_value_substitutions: handles aliases that are only
    // recognized when a keyword carries a specific sequence of several values, rather than a
    // single value. The alias handled here is taken from:
    // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
    //
    // `key` is the keyword key being canonicalized; `values` is that keyword's value list and is
    // rewritten in place when (and only when) it matches the alias exactly.
    bool const is_three_valued_calendar = (key == "ca"sv) && (values.size() == 3);

    if (is_three_valued_calendar) {
        // The match is positional: the three values must appear in exactly this order, with
        // nothing before, between, or after them.
        static Vector<String> legacy_ethiopic_sequence { "ethiopic"sv, "amete"sv, "alem"sv };

        if (values == legacy_ethiopic_sequence)
            values = { "ethioaa"sv };
    }
}
static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
{
auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) {
@ -626,6 +643,8 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
values.append(move(value));
}
perform_hard_coded_key_multi_value_substitutions(key, values);
};
canonicalize_language(locale_id.language_id, false);
@ -644,22 +663,22 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
for (auto& extension : locale_id.extensions) {
extension.visit(
[&](LocaleExtension& ext) {
quick_sort(ext.attributes);
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
for (auto& attribute : ext.attributes)
attribute = attribute.to_lowercase();
for (auto& keyword : ext.keywords)
canonicalize_key_value_list(keyword.key, keyword.types, true);
quick_sort(ext.attributes);
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
},
[&](TransformedExtension& ext) {
if (ext.language.has_value())
canonicalize_language(*ext.language, true);
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
for (auto& field : ext.fields)
canonicalize_key_value_list(field.key, field.values, false);
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
},
[&](OtherExtension& ext) {
ext.key = static_cast<char>(to_ascii_lowercase(ext.key));