sort: test against all month formats in month-sort

The CLDR specification [1] defines three possible month formats:

- Abbreviation (e.g. Jan, Ιαν)
- Full (e.g. January, Ιανουαρίου)
- Standalone (e.g. January, Ιανουάριος)

Many languages use different case endings depending on whether the month
is referenced as a standalone word (nominative case), or in date context
(genitive, partitive, etc.). sort(1)'s -M option currently sorts months
by testing input against only the abbreviation format, which is
essentially a substring of the full format. While this works fine for
languages like English, where there are no cases, for languages where
there is a different case ending between the abbreviation/full and
standalone formats, it is not sufficient.

For example, in Greek, "May" can take the following forms:

Abbreviation: Μαΐ (genitive case)
Full: Μαΐου (genitive case)
Standalone: Μάιος (nominative case)

If we use the standalone format in Greek, sort(1) will not be able to match
"Μαΐ" to "Μάιος" and the sort will fail.

This change makes sort(1) test against all three formats. It also works
when the input contains mixed formats.

[1] https://cldr.unicode.org/translation/date-time/date-time-patterns

Reviewed by:	markj
MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D42847
This commit is contained in:
Christos Margiolis 2023-12-01 02:30:10 +02:00
parent f42518ff12
commit 3d44dce90a
4 changed files with 262 additions and 44 deletions

View file

@ -43,63 +43,114 @@
bool byte_sort;
static wchar_t **wmonths;
static char **cmonths;
/*
 * The three spellings of one month as wide-character strings.
 * initialise_months() fills each field through populate_wmonth(), which
 * stores the locale string upper-cased.
 */
struct wmonth {
/* full name, looked up with MON_1 .. MON_12 */
wchar_t *mon;
/* abbreviated name, looked up with ABMON_1 .. ABMON_12 */
wchar_t *ab;
/* standalone name, looked up with ALTMON_1 .. ALTMON_12 */
wchar_t *alt;
};
/* initialise months */
/*
 * The three spellings of one month as byte strings; used when
 * mb_cur_max == 1.  initialise_months() fills each field through
 * populate_cmonth(), which stores the locale string upper-cased.
 */
struct cmonth {
/* full name, looked up with MON_1 .. MON_12 */
char *mon;
/* abbreviated name, looked up with ABMON_1 .. ABMON_12 */
char *ab;
/* standalone name, looked up with ALTMON_1 .. ALTMON_12 */
char *alt;
};
static struct wmonth *wmonths;
static struct cmonth *cmonths;
/*
 * Look up the locale string for `item' with nl_langinfo(3) and store an
 * upper-cased copy of it (made with sort_strdup()) in `*field'.
 *
 * `idx' is only used to label the line printed when debug_sort is set.
 *
 * Returns 1 when a name was stored.  Returns 0 when the locale supplies an
 * empty string; `*field' is then NULL, so a caller that later tests the
 * pointer never reads an indeterminate value.
 */
static int
populate_cmonth(char **field, const nl_item item, int idx)
{
	char *tmp, *m;
	size_t i, len;

	/* Give the caller a defined value on every return path. */
	*field = NULL;

	tmp = nl_langinfo(item);
	if (debug_sort)
		printf("month[%d]=%s\n", idx, tmp);
	if (*tmp == '\0')
		return (0);

	m = sort_strdup(tmp);
	len = strlen(m);
	/*
	 * toupper() takes an int that must be representable as unsigned
	 * char (or be EOF); a plain char holding an 8-bit character may be
	 * negative, so convert through unsigned char first.
	 * NOTE(review): the keys matched against this table must be
	 * upper-cased the same way for 8-bit locales -- confirm at the
	 * place where the keys are case-folded.
	 */
	for (i = 0; i < len; i++)
		m[i] = (char)toupper((unsigned char)m[i]);

	*field = m;
	return (1);
}
/*
 * Look up the locale string for `item' with nl_langinfo(3), convert it to
 * wide characters and store an upper-cased copy in `*field'.
 *
 * `idx' is only used to label the line printed when debug_sort is set.
 *
 * Returns 1 when a name was stored.  Returns 0 when the locale supplies an
 * empty string or the string cannot be converted; `*field' is then NULL,
 * so a caller that later tests the pointer never reads an indeterminate
 * value.
 */
static int
populate_wmonth(wchar_t **field, const nl_item item, int idx)
{
	wchar_t *m;
	char *tmp;
	size_t i, len, wlen;

	/* Give the caller a defined value on every return path. */
	*field = NULL;

	tmp = nl_langinfo(item);
	if (debug_sort)
		printf("month[%d]=%s\n", idx, tmp);
	if (*tmp == '\0')
		return (0);

	/*
	 * The byte length is an upper bound on the number of wide
	 * characters, so it is used to size the buffer (assumes
	 * SIZEOF_WCHAR_STRING(n) yields room for n wchar_t -- TODO confirm).
	 */
	len = strlen(tmp);
	m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));

	/*
	 * mbstowcs() returns the number of wide characters it produced,
	 * which is smaller than `len' for multibyte names.  Only that many
	 * elements were written, so terminate and upper-case exactly those
	 * instead of walking `len' elements of the buffer.
	 */
	wlen = mbstowcs(m, tmp, len);
	if (wlen == (size_t)-1) {
		sort_free(m);
		return (0);
	}
	m[wlen] = L'\0';
	for (i = 0; i < wlen; i++)
		m[i] = towupper(m[i]);

	*field = m;
	return (1);
}
void
initialise_months(void)
{
const nl_item item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
const nl_item mon_item[12] = { MON_1, MON_2, MON_3, MON_4,
MON_5, MON_6, MON_7, MON_8, MON_9, MON_10,
MON_11, MON_12 };
const nl_item ab_item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4,
ABMON_5, ABMON_6, ABMON_7, ABMON_8, ABMON_9, ABMON_10,
ABMON_11, ABMON_12 };
char *tmp;
size_t len;
const nl_item alt_item[12] = { ALTMON_1, ALTMON_2, ALTMON_3, ALTMON_4,
ALTMON_5, ALTMON_6, ALTMON_7, ALTMON_8, ALTMON_9, ALTMON_10,
ALTMON_11, ALTMON_12 };
int i;
/*
* Handle all possible month formats: abbreviation, full name,
* standalone name (without case ending).
*/
if (mb_cur_max == 1) {
if (cmonths == NULL) {
char *m;
cmonths = sort_malloc(sizeof(char*) * 12);
for (int i = 0; i < 12; i++) {
cmonths[i] = NULL;
tmp = nl_langinfo(item[i]);
if (debug_sort)
printf("month[%d]=%s\n", i, tmp);
if (*tmp == '\0')
cmonths = sort_malloc(sizeof(struct cmonth) * 12);
for (i = 0; i < 12; i++) {
if (!populate_cmonth(&cmonths[i].mon,
mon_item[i], i))
continue;
if (!populate_cmonth(&cmonths[i].ab,
ab_item[i], i))
continue;
if (!populate_cmonth(&cmonths[i].alt,
alt_item[i], i))
continue;
m = sort_strdup(tmp);
len = strlen(tmp);
for (unsigned int j = 0; j < len; j++)
m[j] = toupper(m[j]);
cmonths[i] = m;
}
}
} else {
if (wmonths == NULL) {
wchar_t *m;
wmonths = sort_malloc(sizeof(wchar_t *) * 12);
for (int i = 0; i < 12; i++) {
wmonths[i] = NULL;
tmp = nl_langinfo(item[i]);
if (debug_sort)
printf("month[%d]=%s\n", i, tmp);
if (*tmp == '\0')
wmonths = sort_malloc(sizeof(struct wmonth) * 12);
for (i = 0; i < 12; i++) {
if (!populate_wmonth(&wmonths[i].mon,
mon_item[i], i))
continue;
len = strlen(tmp);
m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1));
if (mbstowcs(m, tmp, len) ==
((size_t) - 1)) {
sort_free(m);
if (!populate_wmonth(&wmonths[i].ab,
ab_item[i], i))
continue;
if (!populate_wmonth(&wmonths[i].alt,
alt_item[i], i))
continue;
}
m[len] = L'\0';
for (unsigned int j = 0; j < len; j++)
m[j] = towupper(m[j]);
wmonths[i] = m;
}
}
}
@ -754,8 +805,11 @@ bws_month_score(const struct bwstring *s0)
++s;
for (int i = 11; i >= 0; --i) {
if (cmonths[i] &&
(s == strstr(s, cmonths[i])))
if (cmonths[i].mon && (s == strstr(s, cmonths[i].mon)))
return (i);
if (cmonths[i].ab && (s == strstr(s, cmonths[i].ab)))
return (i);
if (cmonths[i].alt && (s == strstr(s, cmonths[i].alt)))
return (i);
}
@ -769,7 +823,11 @@ bws_month_score(const struct bwstring *s0)
++s;
for (int i = 11; i >= 0; --i) {
if (wmonths[i] && (s == wcsstr(s, wmonths[i])))
if (wmonths[i].ab && (s == wcsstr(s, wmonths[i].ab)))
return (i);
if (wmonths[i].mon && (s == wcsstr(s, wmonths[i].mon)))
return (i);
if (wmonths[i].alt && (s == wcsstr(s, wmonths[i].alt)))
return (i);
}
}

View file

@ -30,7 +30,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd September 4, 2019
.Dd November 30, 2023
.Dt SORT 1
.Os
.Sh NAME
@ -179,7 +179,7 @@ options (human-readable).
.It Fl i , Fl Fl ignore-nonprinting
Ignore all non-printable characters.
.It Fl M , Fl Fl month-sort , Fl Fl sort=month
Sort by month abbreviations.
Sort by month.
Unknown strings are considered smaller than the month names.
.It Fl n , Fl Fl numeric-sort , Fl Fl sort=numeric
Sort fields numerically by arithmetic value.

View file

@ -2,6 +2,7 @@
PACKAGE= tests
NETBSD_ATF_TESTS_SH= sort_test
ATF_TESTS_SH= sort_monthsort_test
${PACKAGE}FILES+= d_any_char_dflag_out.txt
${PACKAGE}FILES+= d_any_char_fflag_out.txt

View file

@ -0,0 +1,159 @@
#
# SPDX-License-Identifier: BSD-2-Clause
#
# Copyright (c) 2023 Christos Margiolis <christos@FreeBSD.org>
#
# Write one month name per line to the file "in", formatted with the
# date(1) output format passed as ${1} (the callers pass '+%b', '+%B' or
# '+%OB').  The loop iterates over `seq 12 1`, so the file is expected to
# start with December and end with January, i.e. input that sort -M has to
# reorder (assumes seq(1) counts down when first > last -- confirm for the
# target platform).
get_months_fmt()
{
# Start from an empty file; every iteration below appends one line.
rm -f in
for i in $(seq 12 1); do
# Build "2000-MM-01" and let xargs substitute it for {}; date parses it
# with the "%Y-%m-%d" input format and prints it using ${1}.
printf "2000-%02d-01\n" ${i} | xargs -I{} \
date -jf "%Y-%m-%d" {} "${1}" >>in
done
}
atf_test_case monthsort_english
# Test-case metadata: sets the description of the English month test.
monthsort_english_head()
{
atf_set "descr" "Test the -M flag with English months"
}
# Generate the English full month names with get_months_fmt and check that
# `sort -M in` prints them in calendar order, January through December.
monthsort_english_body()
{
# Select the locale used for the month names.
export LC_TIME="en_US.UTF-8"
# Expected output of sort -M.
cat >expout <<EOF
January
February
March
April
May
June
July
August
September
October
November
December
EOF
# No need to test the rest of the formats (%b and %OB) as %b is a
# substring of %B and %OB is the same as %B.
get_months_fmt '+%B'
atf_check -o file:expout sort -M in
}
atf_test_case monthsort_all_formats_greek
# Test-case metadata: sets the description of the per-format Greek test.
monthsort_all_formats_greek_head()
{
atf_set "descr" "Test the -M flag with all possible Greek month formats"
}
# Run three independent checks, one per month format.  Each check writes the
# expected calendar-ordered list to "expout", regenerates "in" with
# get_months_fmt using the matching date(1) format, and compares the output
# of `sort -M in` against "expout".
monthsort_all_formats_greek_body()
{
# Test with the Greek locale, since, unlike English, the
# abbreviation/full-name and standalone formats are different.
export LC_TIME="el_GR.UTF-8"
# Abbreviation format (e.g. Jan, Ιαν), generated with %b
cat >expout <<EOF
Ιαν
Φεβ
Μαρ
Απρ
Μαΐ
Ιουν
Ιουλ
Αυγ
Σεπ
Οκτ
Νοε
Δεκ
EOF
get_months_fmt '+%b'
atf_check -o file:expout sort -M in
# Full-name format (e.g. January, Ιανουαρίου), generated with %B
cat >expout <<EOF
Ιανουαρίου
Φεβρουαρίου
Μαρτίου
Απριλίου
Μαΐου
Ιουνίου
Ιουλίου
Αυγούστου
Σεπτεμβρίου
Οκτωβρίου
Νοεμβρίου
Δεκεμβρίου
EOF
get_months_fmt '+%B'
atf_check -o file:expout sort -M in
# Standalone format (e.g. January, Ιανουάριος), generated with %OB
cat >expout <<EOF
Ιανουάριος
Φεβρουάριος
Μάρτιος
Απρίλιος
Μάϊος
Ιούνιος
Ιούλιος
Αύγουστος
Σεπτέμβριος
Οκτώβριος
Νοέμβριος
Δεκέμβριος
EOF
get_months_fmt '+%OB'
atf_check -o file:expout sort -M in
}
atf_test_case monthsort_mixed_formats_greek
# Test-case metadata: sets the description of the mixed-format Greek test.
monthsort_mixed_formats_greek_head()
{
atf_set "descr" "Test the -M flag with mixed Greek month formats"
}
# Feed sort -M a hand-written list that mixes abbreviated, full and
# standalone Greek month names in reverse calendar order, and check that the
# output is the same twelve lines in calendar order.
monthsort_mixed_formats_greek_body()
{
export LC_TIME="el_GR.UTF-8"
# Input: December first, January last, formats mixed.
cat >in <<EOF
Δεκέμβριος
Νοεμβρίου
Οκτ
Σεπ
Αυγ
Ιούλιος
Ιουνίου
Μαΐου
Απριλίου
Μάρτιος
Φεβρουάριος
Ιανουάριος
EOF
# Expected output: the same lines, January first.
cat >expout <<EOF
Ιανουάριος
Φεβρουάριος
Μάρτιος
Απριλίου
Μαΐου
Ιουνίου
Ιούλιος
Αυγ
Σεπ
Οκτ
Νοεμβρίου
Δεκέμβριος
EOF
atf_check -o file:expout sort -M in
}
# Register the three month-sort test cases defined in this file.
atf_init_test_cases()
{
atf_add_test_case monthsort_english
atf_add_test_case monthsort_all_formats_greek
atf_add_test_case monthsort_mixed_formats_greek
}