[vm] Adjust internal UTF-8 encoder to match dart:convert.

The breaking change https://github.com/dart-lang/sdk/issues/41100
changed the UTF-8 encoder to encode unpaired surrogates as replacement
characters.

However, the VM contains its own, internal UTF-8 encoder, which is used
for printing and for the Dart_StringToUTF8 function in the Dart API.
Here, this encoder is changed to also encode unpaired surrogates as
replacement characters.

Fixes https://github.com/dart-lang/sdk/issues/42094

Change-Id: I9d55168f67d124dbc7987fb759696a98e7526c29
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/149292
Commit-Queue: Aske Simon Christensen <askesc@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Reviewed-by: Daco Harkes <dacoharkes@google.com>
This commit is contained in:
Aske Simon Christensen 2020-06-29 16:10:13 +00:00 committed by commit-bot@chromium.org
parent 46bd867651
commit 3c3a00f331
5 changed files with 27 additions and 5 deletions

View file

@ -90,6 +90,13 @@ Updated the Linter to `0.1.117`, which includes:
### Dart VM
* **Breaking Change** [#41100][]: When printing a string using the `print`
function, the default implementation (used when not overridden by the
embedder or the current zone) will print any unpaired surrogates in the
string as replacement characters (`U+FFFD`). Similarly, the
`Dart_StringToUTF8` function in the Dart API will convert unpaired
surrogates into replacement characters.
### Pub
* `pub run` and `pub global run` accept a `--enable-experiment` flag enabling
experiments in the Dart VM (and language).

View file

@ -1984,6 +1984,10 @@ DART_EXPORT Dart_Handle Dart_StringToCString(Dart_Handle str,
/**
* Gets a UTF-8 encoded representation of a String.
*
* Any unpaired surrogate code points in the string will be converted to
* replacement characters (U+FFFD, 0xEF 0xBF 0xBD in UTF-8). If you need
* to preserve unpaired surrogates, use the Dart_StringToUTF16 function.
*
* \param str A string.
* \param utf8_array Returns the String represented as UTF-8 code
* units. This UTF-8 array is scope allocated and is only valid

View file

@ -18,6 +18,8 @@ class Utf : AllStatic {
static const int32_t kMaxCodePoint = 0x10FFFF;
static const int32_t kInvalidChar = 0xFFFFFFFF;
static const int32_t kReplacementChar = 0xFFFD;
static bool IsLatin1(int32_t code_point) {
return (code_point >= 0) && (code_point <= 0xFF);
}

View file

@ -1557,12 +1557,13 @@ TEST_CASE(DartAPI_MalformedStringToUTF8) {
Dart_Handle result = Dart_StringToUTF8(str1, &utf8_encoded, &utf8_length);
EXPECT_VALID(result);
EXPECT_EQ(3, utf8_length);
EXPECT_EQ(237, static_cast<intptr_t>(utf8_encoded[0]));
EXPECT_EQ(180, static_cast<intptr_t>(utf8_encoded[1]));
EXPECT_EQ(158, static_cast<intptr_t>(utf8_encoded[2]));
// Unpaired surrogate is encoded as replacement character.
EXPECT_EQ(239, static_cast<intptr_t>(utf8_encoded[0]));
EXPECT_EQ(191, static_cast<intptr_t>(utf8_encoded[1]));
EXPECT_EQ(189, static_cast<intptr_t>(utf8_encoded[2]));
Dart_Handle str2 = Dart_NewStringFromUTF8(utf8_encoded, utf8_length);
EXPECT_VALID(str2); // Standalone low surrogate, but still valid
EXPECT_VALID(str2); // Replacement character, but still valid
Dart_Handle reversed = Dart_Invoke(lib, NewString("reversed"), 0, NULL);
EXPECT_VALID(reversed); // This is also allowed.
@ -1572,7 +1573,8 @@ TEST_CASE(DartAPI_MalformedStringToUTF8) {
&utf8_length_reversed);
EXPECT_VALID(result);
EXPECT_EQ(6, utf8_length_reversed);
uint8_t expected[6] = {237, 180, 158, 237, 160, 180};
// Two unpaired surrogates are encoded as two replacement characters.
uint8_t expected[6] = {239, 191, 189, 239, 191, 189};
for (int i = 0; i < 6; i++) {
EXPECT_EQ(expected[i], utf8_encoded_reversed[i]);
}

View file

@ -119,6 +119,13 @@ intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
String::CodePointIterator it(src);
while (it.Next()) {
int32_t ch = it.Current();
ASSERT(!Utf::IsOutOfRange(ch));
if (Utf16::IsSurrogate(ch)) {
// Encode unpaired surrogates as replacement characters to ensure the
// output is valid UTF-8. Encoded size is the same (3), so the computed
// length is still valid.
ch = Utf::kReplacementChar;
}
intptr_t num_bytes = Utf8::Length(ch);
if (pos + num_bytes > len) {
break;