Merge pull request #28146 from keszybz/ansi-seq-skip

Pass ANSI sequences through when ellipsizing output
This commit is contained in:
Lennart Poettering 2023-06-28 19:18:27 +02:00 committed by GitHub
commit 4d8eca03e5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 206 additions and 40 deletions

View file

@ -295,6 +295,62 @@ static int write_ellipsis(char *buf, bool unicode) {
return 3;
}
static size_t ansi_sequence_length(const char *s, size_t len) {
        /* Returns the number of bytes taken up by the ANSI escape sequence located at the very start of
         * 's' (which holds 'len' bytes), or 0 if 's' does not begin with a complete, well-formed
         * sequence. Never reads beyond s[len - 1]. */

        assert(s);

        /* Every sequence is ESC (ASCII 27, aka Ctrl-[) followed by at least one more byte */
        if (len < 2 || s[0] != 0x1B)
                return 0;

        if (s[1] != 0x5B)
                /* Not a CSI introducer: only the two-byte Fe sequences (second byte 0x40..0x5F) are
                 * recognized, anything else is a bad escape. */
                return (s[1] >= 0x40 && s[1] <= 0x5F) ? 2 : 0;

        /* CSI sequence: ESC [ <parameter bytes>* <intermediate bytes>* <final byte> */
        size_t pos = 2;

        for (; pos < len; pos++) /* Parameter bytes */
                if (s[pos] < 0x30 || s[pos] > 0x3F)
                        break;

        for (; pos < len; pos++) /* Intermediate bytes */
                if (s[pos] < 0x20 || s[pos] > 0x2F)
                        break;

        if (pos >= len)
                return 0; /* Buffer ended before the final byte showed up */

        if (s[pos] >= 0x40 && s[pos] <= 0x7E) /* Final byte */
                return pos + 1;

        return 0; /* Bad sequence */
}
static bool string_has_ansi_sequence(const char *s, size_t len) {
        /* Returns true if the 'len' bytes starting at 's' contain at least one well-formed ANSI sequence,
         * as judged by ansi_sequence_length(). */

        const char *t = s;

        /* Continue each search at 't', not at 's': restarting from the beginning with a shrinking length
         * either finds the same ESC byte over and over (endless loop) or stops looking before the end of
         * the buffer is reached. */
        while ((t = memchr(t, 0x1B, len - (t - s)))) {
                if (ansi_sequence_length(t, len - (t - s)) > 0)
                        return true;

                /* This ESC does not start a valid sequence, step over it and keep looking */
                t++;
        }

        return false;
}
static size_t previous_ansi_sequence(const char *s, size_t length, const char **ret_where) {
        /* Locate the previous ANSI sequence, i.e. the one starting closest to (but before) offset 'length'
         * in 's', save its start in *ret_where and return its length in bytes. Only the first 'length'
         * bytes of 's' are looked at. If there is no such sequence, *ret_where is set to NULL and 0 is
         * returned. */

        assert(s);
        assert(ret_where);

        /* A sequence takes at least two bytes, so shorter inputs cannot contain one. Checking this
         * explicitly also keeps the unsigned start index below from wrapping around, which would make us
         * read memory in front of 's'. */
        if (length >= 2)
                /* 'i' is the candidate start offset plus one, so that the countdown can terminate on an
                 * unsigned index. The first candidate is offset length - 2, the last position where a
                 * two-byte sequence still fits. */
                for (size_t i = length - 1; i > 0; i--) {
                        size_t slen = ansi_sequence_length(s + (i - 1), length - (i - 1));
                        if (slen == 0)
                                continue;

                        *ret_where = s + (i - 1);
                        return slen;
                }

        *ret_where = NULL;
        return 0;
}
static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
size_t x, need_space, suffix_len;
char *t;
@ -354,7 +410,6 @@ static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_le
char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
size_t x, k, len, len2;
const char *i, *j;
char *e;
int r;
/* Note that 'old_length' refers to bytes in the string, while 'new_length' refers to character cells taken up
@ -378,73 +433,117 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
if (new_length == 0)
return strdup("");
/* If no multibyte characters use ascii_ellipsize_mem for speed */
if (ascii_is_valid_n(s, old_length))
bool has_ansi_seq = string_has_ansi_sequence(s, old_length);
/* If no multibyte characters or ANSI sequences, use ascii_ellipsize_mem for speed */
if (!has_ansi_seq && ascii_is_valid_n(s, old_length))
return ascii_ellipsize_mem(s, old_length, new_length, percent);
x = ((new_length - 1) * percent) / 100;
x = (new_length - 1) * percent / 100;
assert(x <= new_length - 1);
k = 0;
for (i = s; i < s + old_length; i = utf8_next_char(i)) {
char32_t c;
int w;
for (i = s; i < s + old_length; ) {
size_t slen = has_ansi_seq ? ansi_sequence_length(i, old_length - (i - s)) : 0;
if (slen > 0) {
i += slen;
continue; /* ANSI sequences don't take up any space in output */
}
char32_t c;
r = utf8_encoded_to_unichar(i, &c);
if (r < 0)
return NULL;
w = unichar_iswide(c) ? 2 : 1;
if (k + w <= x)
k += w;
else
int w = unichar_iswide(c) ? 2 : 1;
if (k + w > x)
break;
k += w;
i += r;
}
for (j = s + old_length; j > i; ) {
const char *ansi_start = s + old_length;
size_t ansi_len = 0;
for (const char *t = j = s + old_length; t > i && k < new_length; ) {
char32_t c;
int w;
const char *jj;
const char *tt;
jj = utf8_prev_char(j);
r = utf8_encoded_to_unichar(jj, &c);
if (has_ansi_seq && ansi_start >= t)
/* Figure out the previous ANSI sequence, if any */
ansi_len = previous_ansi_sequence(s, t - s, &ansi_start);
/* If the sequence extends all the way to the current position, skip it. */
if (has_ansi_seq && ansi_len > 0 && ansi_start + ansi_len == t) {
t = ansi_start;
continue;
}
tt = utf8_prev_char(t);
r = utf8_encoded_to_unichar(tt, &c);
if (r < 0)
return NULL;
w = unichar_iswide(c) ? 2 : 1;
if (k + w <= new_length) {
k += w;
j = jj;
} else
if (k + w > new_length)
break;
}
assert(i <= j);
/* we don't actually need to ellipsize */
if (i == j)
k += w;
j = t = tt; /* j should always point to the first "real" character */
}
/* We don't actually need to ellipsize */
if (i >= j)
return memdup_suffix0(s, old_length);
/* make space for ellipsis, if possible */
if (j < s + old_length)
j = utf8_next_char(j);
else if (i > s)
i = utf8_prev_char(i);
if (k >= new_length) {
/* Make space for ellipsis, if required and possible. We know that the edge character is not
* part of an ANSI sequence (because then we'd skip it). If the last character we looked at
* was wide, we don't need to make space. */
if (j < s + old_length)
j = utf8_next_char(j);
else if (i > s)
i = utf8_prev_char(i);
}
len = i - s;
len2 = s + old_length - j;
e = new(char, len + 3 + len2 + 1);
/* If we have ANSI, allow the same length as the source string + ellipsis. It'd be too involved to
* figure out what exact space is needed. Strings with ANSI sequences are most likely to be fairly
* short anyway. */
size_t alloc_len = has_ansi_seq ? old_length + 3 + 1 : len + 3 + len2 + 1;
char *e = new(char, alloc_len);
if (!e)
return NULL;
/*
printf("old_length=%zu new_length=%zu x=%zu len=%u len2=%u k=%u\n",
printf("old_length=%zu new_length=%zu x=%zu len=%zu len2=%zu k=%zu\n",
old_length, new_length, x, len, len2, k);
*/
memcpy(e, s, len);
memcpy_safe(e, s, len);
write_ellipsis(e + len, true);
memcpy(e + len + 3, j, len2);
*(e + len + 3 + len2) = '\0';
char *dst = e + len + 3;
if (has_ansi_seq)
/* Copy over any ANSI sequences in full */
for (const char *p = s + len; p < j; ) {
size_t slen = ansi_sequence_length(p, j - p);
if (slen > 0) {
memcpy(dst, p, slen);
dst += slen;
p += slen;
} else
p = utf8_next_char(p);
}
memcpy_safe(dst, j, len2);
dst[len2] = '\0';
return e;
}

View file

@ -90,7 +90,7 @@ int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
switch (len) {
case 1:
*ret_unichar = (char32_t)str[0];
return 0;
return 1;
case 2:
unichar = str[0] & 0x1f;
break;
@ -119,15 +119,14 @@ int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
}
*ret_unichar = unichar;
return 0;
return len;
}
bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newline) {
assert(str);
for (const char *p = str; length > 0;) {
int encoded_len, r;
int encoded_len;
char32_t val;
encoded_len = utf8_encoded_valid_unichar(p, length);
@ -135,8 +134,7 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin
return false;
assert(encoded_len > 0 && (size_t) encoded_len <= length);
r = utf8_encoded_to_unichar(p, &val);
if (r < 0 ||
if (utf8_encoded_to_unichar(p, &val) < 0 ||
unichar_is_control(val) ||
(!allow_newline && val == '\n'))
return false;

View file

@ -311,7 +311,7 @@ static mhd_result request_handler(
if (chunked)
return mhd_respond(connection, MHD_HTTP_BAD_REQUEST,
"Content-Length must not specified when Transfer-Encoding type is 'chunked'");
"Content-Length not allowed when Transfer-Encoding type is 'chunked'");
r = safe_atozu(header, &len);
if (r < 0)

View file

@ -93,6 +93,7 @@ simple_tests += files(
'test-fstab-util.c',
'test-glob-util.c',
'test-gpt.c',
'test-gunicode.c',
'test-hash-funcs.c',
'test-hexdecoct.c',
'test-hmac.c',

View file

@ -4,6 +4,7 @@
#include "alloc-util.h"
#include "constants.h"
#include "escape.h"
#include "string-util.h"
#include "strv.h"
#include "terminal-util.h"
@ -115,4 +116,44 @@ TEST(ellipsize) {
test_ellipsize_one("shórt");
}
TEST(ellipsize_ansi) {
        /* Ellipsize a string that mixes ANSI color sequences with multi-byte characters to every width
         * from 21 cells down to 0, at several split percentages, and verify that the result always remains
         * valid UTF-8. */
        const char *s = ANSI_HIGHLIGHT_YELLOW_UNDERLINE "yęllow"
                        ANSI_HIGHLIGHT_GREY_UNDERLINE "grěy"
                        ANSI_HIGHLIGHT_BLUE_UNDERLINE "blue"
                        ANSI_NORMAL "nórmął";
        size_t len = strlen(s);

        /* NOTE(review): with a step of 15 the percentages tried are 0, 15, ..., 90; 100 is never reached. */
        for (unsigned percent = 0; percent <= 100; percent += 15)
                for (ssize_t x = 21; x >= 0; x--) {
                        _cleanup_free_ char *t = ellipsize_mem(s, len, x, percent);

                        /* ellipsize_mem() returns NULL on failure; catch that here instead of passing
                         * NULL on to printf("%s") and utf8_is_valid(). */
                        assert_se(t);

                        printf("%02zd: \"%s\"\n", x, t);
                        assert_se(utf8_is_valid(t));

                        if (DEBUG_LOGGING) {
                                _cleanup_free_ char *e = cescape(t);
                                printf("  : \"%s\"\n", e);
                        }
                }
}
TEST(ellipsize_ansi_cats) {
        /* Always initialize variables that carry a cleanup handler, so that the handler never gets to see
         * an indeterminate pointer. */
        _cleanup_free_ char *e = NULL, *f = NULL, *g = NULL, *h = NULL;

        /* Make sure we don't cut off in the middle of an ANSI escape sequence. */

        e = ellipsize("01" ANSI_NORMAL "23", 4, 0);
        assert_se(e);
        puts(e);
        assert_se(streq(e, "01" ANSI_NORMAL "23"));

        f = ellipsize("ab" ANSI_NORMAL "cd", 4, 90);
        assert_se(f);
        puts(f);
        assert_se(streq(f, "ab" ANSI_NORMAL "cd"));

        /* With 0% nothing is kept in front of the ellipsis, but the ellipsis itself is still written,
         * followed by the ANSI sequence from the dropped part and the tail that fits. */
        g = ellipsize("🐱🐱" ANSI_NORMAL "🐱🐱" ANSI_NORMAL, 5, 0);
        assert_se(g);
        puts(g);
        assert_se(streq(g, "…" ANSI_NORMAL "🐱🐱" ANSI_NORMAL));

        h = ellipsize("🐱🐱" ANSI_NORMAL "🐱🐱" ANSI_NORMAL, 5, 90);
        assert_se(h);
        puts(h);
        assert_se(streq(h, "🐱…" ANSI_NORMAL "🐱" ANSI_NORMAL));
}
DEFINE_TEST_MAIN(LOG_INFO);

27
src/test/test-gunicode.c Normal file
View file

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "gunicode.h"
#include "tests.h"
#include "utf8.h"
TEST(unichar_iswide) {
        /* Walk over two lists of characters and check that unichar_iswide() classifies each entry of the
         * first list as narrow and each entry of the second list as wide. */
        char32_t c;
        int r;

        /* FIXME: the cats are wide, but we get this wrong */
        for (const char *narrow = "abX_…ąęµ!" "😼😿🙀😸😻"; *narrow; narrow += r) {
                r = utf8_encoded_to_unichar(narrow, &c);
                /* Only look at 'c' once we know that decoding worked and it was actually set */
                assert_se(r > 0);
                assert_se(!unichar_iswide(c));
        }

        /* The slash and the yen sign must be the fullwidth forms (U+FF0F, U+FFE5) here; their regular
         * counterparts are narrow. */
        for (const char *wide = "🐱／￥"; *wide; wide += r) {
                r = utf8_encoded_to_unichar(wide, &c);
                assert_se(r > 0);
                assert_se(unichar_iswide(c));
        }
}
DEFINE_TEST_MAIN(LOG_INFO);