Fix bug in Utf8::CodePointCount which was causing some strings with latin1 characters to be stored as TwoByteStrings.

Review URL: https://codereview.chromium.org//11419259

git-svn-id: https://dart.googlecode.com/svn/branches/bleeding_edge/dart@15604 260f80e4-7a28-3924-810f-c04153c831b5
2024-11-05 18:22:09 +00:00 · 2012-11-30 21:53:58 +00:00 · 2012-11-30 21:53:58 +00:00 · 76def187f8
commit 76def187f8
parent d95f19ec88
7 changed files with 42 additions and 26 deletions
--- a/runtime/vm/dart_api_impl_test.cc
+++ b/runtime/vm/dart_api_impl_test.cc
@@ -513,6 +513,17 @@ TEST_CASE(ArrayValues) {


 TEST_CASE(IsString) {
+  uint8_t latin1[] = { 'o', 'n', 'e', 0xC2, 0xA2 };
+
+  Dart_Handle latin1str = Dart_NewStringFromUTF8(latin1, ARRAY_SIZE(latin1));
+  EXPECT_VALID(latin1str);
+  EXPECT(Dart_IsString(latin1str));
+  EXPECT(Dart_IsStringLatin1(latin1str));
+  EXPECT(!Dart_IsExternalString(latin1str));
+  intptr_t len = -1;
+  EXPECT_VALID(Dart_StringLength(latin1str, &len));
+  EXPECT_EQ(4, len);
+
  uint8_t data8[] = { 'o', 'n', 'e', 0x7F };

  Dart_Handle str8 = Dart_NewStringFromUTF8(data8, ARRAY_SIZE(data8));
--- a/runtime/vm/dart_api_message.cc
+++ b/runtime/vm/dart_api_message.cc
@@ -804,7 +804,7 @@ bool ApiMessageWriter::WriteCObjectInlined(Dart_CObject* object,
      }

      Utf8::Type type;
-      intptr_t len = Utf8::CodePointCount(utf8_str, utf8_len, &type);
+      intptr_t len = Utf8::CodeUnitCount(utf8_str, utf8_len, &type);

      // Write out the serialization header value for this object.
      WriteInlinedHeader(object);
--- a/runtime/vm/object.cc
+++ b/runtime/vm/object.cc
@@ -10063,7 +10063,7 @@ RawString* String::New(const uint8_t* utf8_array,
                       intptr_t array_len,
                       Heap::Space space) {
  Utf8::Type type;
-  intptr_t len = Utf8::CodePointCount(utf8_array, array_len, &type);
+  intptr_t len = Utf8::CodeUnitCount(utf8_array, array_len, &type);
  if (type == Utf8::kLatin1) {
    const String& strobj = String::Handle(OneByteString::New(len, space));
    if (len > 0) {
--- a/runtime/vm/object_test.cc
+++ b/runtime/vm/object_test.cc
@@ -1192,13 +1192,13 @@ TEST_CASE(StringSubStringDifferentWidth) {

  const String& onestr = String::Handle(String::New(onechars));
  EXPECT(!onestr.IsNull());
-  EXPECT(!onestr.IsOneByteString());
-  EXPECT(onestr.IsTwoByteString());
+  EXPECT(onestr.IsOneByteString());
+  EXPECT(!onestr.IsTwoByteString());

  const String& onesub = String::Handle(String::SubString(onestr, 0));
  EXPECT(!onesub.IsNull());
-  EXPECT(!onestr.IsOneByteString());
-  EXPECT(onestr.IsTwoByteString());
+  EXPECT(onestr.IsOneByteString());
+  EXPECT(!onestr.IsTwoByteString());
  EXPECT_EQ(onesub.Length(), 3);

  // Create 1- and 2-byte substrings from a 2-byte source string.
@@ -1290,7 +1290,7 @@ TEST_CASE(StringFromUtf8Literal) {
      0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    };
    const String& str = String::Handle(String::New(src));
-    EXPECT(str.IsTwoByteString());
+    EXPECT(str.IsOneByteString());
    intptr_t expected_length = sizeof(expected);
    EXPECT_EQ(expected_length, str.Length());
    for (int i = 0; i < str.Length(); ++i) {
--- a/runtime/vm/symbols.cc
+++ b/runtime/vm/symbols.cc
@@ -102,7 +102,7 @@ RawString* Symbols::New(const char* str) {
  Utf8::Type type;
  intptr_t str_len = strlen(str);
  const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str);
-  intptr_t len = Utf8::CodePointCount(utf8_array, str_len, &type);
+  intptr_t len = Utf8::CodeUnitCount(utf8_array, str_len, &type);
  Zone* zone = Isolate::Current()->current_zone();
  if (len == 0) {
    return Symbols::New(reinterpret_cast<uint8_t*>(NULL), 0);
--- a/runtime/vm/unicode.cc
+++ b/runtime/vm/unicode.cc
@@ -53,23 +53,25 @@ const uint32_t Utf8::kOverlongMinimum[7] = {
 };


-// Returns a count of the number of UTF-8 trail bytes.
-intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
-                              intptr_t array_len,
-                              Type* type) {
+// Returns the most restricted coding form in which the sequence of utf8
+// characters in 'utf8_array' can be represented in, and the number of
+// code units needed in that form.
+intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
+                             intptr_t array_len,
+                             Type* type) {
  intptr_t len = 0;
  Type char_type = kLatin1;
  for (intptr_t i = 0; i < array_len; i++) {
    uint8_t code_unit = utf8_array[i];
    if (!IsTrailByte(code_unit)) {
      ++len;
-    }
-    if (!IsLatin1SequenceStart(code_unit)) {  // > U+00FF
-      if (IsSupplementarySequenceStart(code_unit)) {  // >= U+10000
-        char_type = kSupplementary;
-        ++len;
-      } else if (char_type == kLatin1) {
-        char_type = kBMP;
+      if (!IsLatin1SequenceStart(code_unit)) {  // > U+00FF
+        if (IsSupplementarySequenceStart(code_unit)) {  // >= U+10000
+          char_type = kSupplementary;
+          ++len;
+        } else if (char_type == kLatin1) {
+          char_type = kBMP;
+        }
      }
    }
  }
--- a/runtime/vm/unicode.h
+++ b/runtime/vm/unicode.h
@@ -43,9 +43,12 @@ class Utf8 : AllStatic {
    kSupplementary,  // Supplementary code point [U+010000, U+10FFFF].
  };

-  static intptr_t CodePointCount(const uint8_t* utf8_array,
-                                 intptr_t array_len,
-                                 Type* type);
+  // Returns the most restricted coding form in which the sequence of utf8
+  // characters in 'utf8_array' can be represented in, and the number of
+  // code units needed in that form.
+  static intptr_t CodeUnitCount(const uint8_t* utf8_array,
+                                intptr_t array_len,
+                                Type* type);

  // Returns true if 'utf8_array' is a valid UTF-8 string.
  static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);
@@ -83,7 +86,7 @@ class Utf8 : AllStatic {
  static const int32_t kMaxFourByteChar  = Utf::kMaxCodePoint;

  static bool IsTrailByte(uint8_t code_unit) {
-    return (code_unit & 0xc0) == 0x80;
+    return (code_unit & 0xC0) == 0x80;
  }

  static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) {
@@ -91,12 +94,12 @@ class Utf8 : AllStatic {
  }

  static bool IsLatin1SequenceStart(uint8_t code_unit) {
-    // Check is codepoint is <= U+00FF
-    return (code_unit <= Utf8::kMaxOneByteChar);
+    // Check if utf8 sequence is the start of a codepoint <= U+00FF
+    return (code_unit <= 0xC3);
  }

  static bool IsSupplementarySequenceStart(uint8_t code_unit) {
-    // Check is codepoint is >= U+10000.
+    // Check if utf8 sequence is the start of a codepoint >= U+10000.
    return (code_unit >= 0xF0);
  }