[vm] Adjust internal UTF-8 encoder to match dart:convert.

The breaking change https://github.com/dart-lang/sdk/issues/41100 changed the UTF-8 encoder to encode unpaired surrogates as replacement characters. However, the VM contains its own, internal UTF-8 encoder, which is used for printing and for the Dart_StringToUTF8 function in the Dart API.

Here, this encoder is changed to also encode unpaired surrogates as replacement characters.

Fixes https://github.com/dart-lang/sdk/issues/42094

Change-Id: I9d55168f67d124dbc7987fb759696a98e7526c29
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/149292
Commit-Queue: Aske Simon Christensen <askesc@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Reviewed-by: Daco Harkes <dacoharkes@google.com>
2024-10-01 17:41:53 +00:00 · 2020-06-29 16:10:13 +00:00 · 2020-06-29 16:10:13 +00:00 · 3c3a00f331
parent 46bd867651
commit 3c3a00f331
5 changed files with 27 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -90,6 +90,13 @@ Updated the Linter to `0.1.117`, which includes:

 ### Dart VM

+*   **Breaking Change** [#41100][]: When printing a string using the `print`
+    function, the default implementation (used when not overridden by the
+    embedder or the current zone) will print any unpaired surrogates in the
+    string as replacement characters (`U+FFFD`). Similarly, the
+    `Dart_StringToUTF8` function in the Dart API will convert unpaired
+    surrogates into replacement characters.
+
 ### Pub
 * `pub run` and `pub global run` accepts a `--enable-experiment` flag enabling
  experiments in the Dart VM (and language).
--- a/runtime/include/dart_api.h
+++ b/runtime/include/dart_api.h
@@ -1984,6 +1984,10 @@ DART_EXPORT Dart_Handle Dart_StringToCString(Dart_Handle str,
 /**
 * Gets a UTF-8 encoded representation of a String.
 *
+ * Any unpaired surrogate code points in the string will be converted to
+ * replacement characters (U+FFFD, 0xEF 0xBF 0xBD in UTF-8). If you need
+ * to preserve unpaired surrogates, use the Dart_StringToUTF16 function.
+ *
 * \param str A string.
 * \param utf8_array Returns the String represented as UTF-8 code
 *   units.  This UTF-8 array is scope allocated and is only valid
--- a/runtime/platform/unicode.h
+++ b/runtime/platform/unicode.h
@@ -18,6 +18,8 @@ class Utf : AllStatic {
  static const int32_t kMaxCodePoint = 0x10FFFF;
  static const int32_t kInvalidChar = 0xFFFFFFFF;

+  static const int32_t kReplacementChar = 0xFFFD;
+
  // Reports whether |code_point| lies in the inclusive range [0, 0xFF].
  static bool IsLatin1(int32_t code_point) {
    if (code_point < 0) {
      return false;
    }
    return code_point <= 0xFF;
  }
--- a/runtime/vm/dart_api_impl_test.cc
+++ b/runtime/vm/dart_api_impl_test.cc
@@ -1557,12 +1557,13 @@ TEST_CASE(DartAPI_MalformedStringToUTF8) {
  Dart_Handle result = Dart_StringToUTF8(str1, &utf8_encoded, &utf8_length);
  EXPECT_VALID(result);
  EXPECT_EQ(3, utf8_length);
-  EXPECT_EQ(237, static_cast<intptr_t>(utf8_encoded[0]));
-  EXPECT_EQ(180, static_cast<intptr_t>(utf8_encoded[1]));
-  EXPECT_EQ(158, static_cast<intptr_t>(utf8_encoded[2]));
+  // Unpaired surrogate is encoded as replacement character.
+  EXPECT_EQ(239, static_cast<intptr_t>(utf8_encoded[0]));
+  EXPECT_EQ(191, static_cast<intptr_t>(utf8_encoded[1]));
+  EXPECT_EQ(189, static_cast<intptr_t>(utf8_encoded[2]));

  Dart_Handle str2 = Dart_NewStringFromUTF8(utf8_encoded, utf8_length);
-  EXPECT_VALID(str2);  // Standalone low surrogate, but still valid
+  EXPECT_VALID(str2);  // Replacement character, but still valid

  Dart_Handle reversed = Dart_Invoke(lib, NewString("reversed"), 0, NULL);
  EXPECT_VALID(reversed);  // This is also allowed.
@@ -1572,7 +1573,8 @@ TEST_CASE(DartAPI_MalformedStringToUTF8) {
                             &utf8_length_reversed);
  EXPECT_VALID(result);
  EXPECT_EQ(6, utf8_length_reversed);
-  uint8_t expected[6] = {237, 180, 158, 237, 160, 180};
+  // Two unpaired surrogates are encoded as two replacement characters.
+  uint8_t expected[6] = {239, 191, 189, 239, 191, 189};
  for (int i = 0; i < 6; i++) {
    EXPECT_EQ(expected[i], utf8_encoded_reversed[i]);
  }
--- a/runtime/vm/unicode.cc
+++ b/runtime/vm/unicode.cc
@@ -119,6 +119,13 @@ intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
    String::CodePointIterator it(src);
    while (it.Next()) {
      int32_t ch = it.Current();
+      ASSERT(!Utf::IsOutOfRange(ch));
+      if (Utf16::IsSurrogate(ch)) {
+        // Encode unpaired surrogates as replacement characters to ensure the
+        // output is valid UTF-8. Encoded size is the same (3), so the computed
+        // length is still valid.
+        ch = Utf::kReplacementChar;
+      }
      intptr_t num_bytes = Utf8::Length(ch);
      if (pos + num_bytes > len) {
        break;