mirror of
https://github.com/dart-lang/sdk
synced 2024-11-05 18:22:09 +00:00
Fix bug in Utf8::CodePointCount which was causing some strings with latin1
characters to be stored as TwoByteStrings.

Review URL: https://codereview.chromium.org//11419259

git-svn-id: https://dart.googlecode.com/svn/branches/bleeding_edge/dart@15604 260f80e4-7a28-3924-810f-c04153c831b5
This commit is contained in:
parent
d95f19ec88
commit
76def187f8
7 changed files with 42 additions and 26 deletions
|
@ -513,6 +513,17 @@ TEST_CASE(ArrayValues) {
|
|||
|
||||
|
||||
TEST_CASE(IsString) {
|
||||
uint8_t latin1[] = { 'o', 'n', 'e', 0xC2, 0xA2 };
|
||||
|
||||
Dart_Handle latin1str = Dart_NewStringFromUTF8(latin1, ARRAY_SIZE(latin1));
|
||||
EXPECT_VALID(latin1str);
|
||||
EXPECT(Dart_IsString(latin1str));
|
||||
EXPECT(Dart_IsStringLatin1(latin1str));
|
||||
EXPECT(!Dart_IsExternalString(latin1str));
|
||||
intptr_t len = -1;
|
||||
EXPECT_VALID(Dart_StringLength(latin1str, &len));
|
||||
EXPECT_EQ(4, len);
|
||||
|
||||
uint8_t data8[] = { 'o', 'n', 'e', 0x7F };
|
||||
|
||||
Dart_Handle str8 = Dart_NewStringFromUTF8(data8, ARRAY_SIZE(data8));
|
||||
|
|
|
@ -804,7 +804,7 @@ bool ApiMessageWriter::WriteCObjectInlined(Dart_CObject* object,
|
|||
}
|
||||
|
||||
Utf8::Type type;
|
||||
intptr_t len = Utf8::CodePointCount(utf8_str, utf8_len, &type);
|
||||
intptr_t len = Utf8::CodeUnitCount(utf8_str, utf8_len, &type);
|
||||
|
||||
// Write out the serialization header value for this object.
|
||||
WriteInlinedHeader(object);
|
||||
|
|
|
@ -10063,7 +10063,7 @@ RawString* String::New(const uint8_t* utf8_array,
|
|||
intptr_t array_len,
|
||||
Heap::Space space) {
|
||||
Utf8::Type type;
|
||||
intptr_t len = Utf8::CodePointCount(utf8_array, array_len, &type);
|
||||
intptr_t len = Utf8::CodeUnitCount(utf8_array, array_len, &type);
|
||||
if (type == Utf8::kLatin1) {
|
||||
const String& strobj = String::Handle(OneByteString::New(len, space));
|
||||
if (len > 0) {
|
||||
|
|
|
@ -1192,13 +1192,13 @@ TEST_CASE(StringSubStringDifferentWidth) {
|
|||
|
||||
const String& onestr = String::Handle(String::New(onechars));
|
||||
EXPECT(!onestr.IsNull());
|
||||
EXPECT(!onestr.IsOneByteString());
|
||||
EXPECT(onestr.IsTwoByteString());
|
||||
EXPECT(onestr.IsOneByteString());
|
||||
EXPECT(!onestr.IsTwoByteString());
|
||||
|
||||
const String& onesub = String::Handle(String::SubString(onestr, 0));
|
||||
EXPECT(!onesub.IsNull());
|
||||
EXPECT(!onestr.IsOneByteString());
|
||||
EXPECT(onestr.IsTwoByteString());
|
||||
EXPECT(onestr.IsOneByteString());
|
||||
EXPECT(!onestr.IsTwoByteString());
|
||||
EXPECT_EQ(onesub.Length(), 3);
|
||||
|
||||
// Create 1- and 2-byte substrings from a 2-byte source string.
|
||||
|
@ -1290,7 +1290,7 @@ TEST_CASE(StringFromUtf8Literal) {
|
|||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
};
|
||||
const String& str = String::Handle(String::New(src));
|
||||
EXPECT(str.IsTwoByteString());
|
||||
EXPECT(str.IsOneByteString());
|
||||
intptr_t expected_length = sizeof(expected);
|
||||
EXPECT_EQ(expected_length, str.Length());
|
||||
for (int i = 0; i < str.Length(); ++i) {
|
||||
|
|
|
@ -102,7 +102,7 @@ RawString* Symbols::New(const char* str) {
|
|||
Utf8::Type type;
|
||||
intptr_t str_len = strlen(str);
|
||||
const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str);
|
||||
intptr_t len = Utf8::CodePointCount(utf8_array, str_len, &type);
|
||||
intptr_t len = Utf8::CodeUnitCount(utf8_array, str_len, &type);
|
||||
Zone* zone = Isolate::Current()->current_zone();
|
||||
if (len == 0) {
|
||||
return Symbols::New(reinterpret_cast<uint8_t*>(NULL), 0);
|
||||
|
|
|
@ -53,23 +53,25 @@ const uint32_t Utf8::kOverlongMinimum[7] = {
|
|||
};
|
||||
|
||||
|
||||
// Returns a count of the number of UTF-8 trail bytes.
|
||||
intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
|
||||
intptr_t array_len,
|
||||
Type* type) {
|
||||
// Returns the most restricted coding form in which the sequence of utf8
|
||||
// characters in 'utf8_array' can be represented, and the number of
|
||||
// code units needed in that form.
|
||||
intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
|
||||
intptr_t array_len,
|
||||
Type* type) {
|
||||
intptr_t len = 0;
|
||||
Type char_type = kLatin1;
|
||||
for (intptr_t i = 0; i < array_len; i++) {
|
||||
uint8_t code_unit = utf8_array[i];
|
||||
if (!IsTrailByte(code_unit)) {
|
||||
++len;
|
||||
}
|
||||
if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
|
||||
if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000
|
||||
char_type = kSupplementary;
|
||||
++len;
|
||||
} else if (char_type == kLatin1) {
|
||||
char_type = kBMP;
|
||||
if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
|
||||
if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000
|
||||
char_type = kSupplementary;
|
||||
++len;
|
||||
} else if (char_type == kLatin1) {
|
||||
char_type = kBMP;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,9 +43,12 @@ class Utf8 : AllStatic {
|
|||
kSupplementary, // Supplementary code point [U+010000, U+10FFFF].
|
||||
};
|
||||
|
||||
static intptr_t CodePointCount(const uint8_t* utf8_array,
|
||||
intptr_t array_len,
|
||||
Type* type);
|
||||
// Returns the most restricted coding form in which the sequence of utf8
|
||||
// characters in 'utf8_array' can be represented, and the number of
|
||||
// code units needed in that form.
|
||||
static intptr_t CodeUnitCount(const uint8_t* utf8_array,
|
||||
intptr_t array_len,
|
||||
Type* type);
|
||||
|
||||
// Returns true if 'utf8_array' is a valid UTF-8 string.
|
||||
static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);
|
||||
|
@ -83,7 +86,7 @@ class Utf8 : AllStatic {
|
|||
static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint;
|
||||
|
||||
static bool IsTrailByte(uint8_t code_unit) {
|
||||
return (code_unit & 0xc0) == 0x80;
|
||||
return (code_unit & 0xC0) == 0x80;
|
||||
}
|
||||
|
||||
static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) {
|
||||
|
@ -91,12 +94,12 @@ class Utf8 : AllStatic {
|
|||
}
|
||||
|
||||
static bool IsLatin1SequenceStart(uint8_t code_unit) {
|
||||
// Check is codepoint is <= U+00FF
|
||||
return (code_unit <= Utf8::kMaxOneByteChar);
|
||||
// Check if utf8 sequence is the start of a codepoint <= U+00FF
|
||||
return (code_unit <= 0xC3);
|
||||
}
|
||||
|
||||
static bool IsSupplementarySequenceStart(uint8_t code_unit) {
|
||||
// Check is codepoint is >= U+10000.
|
||||
// Check if utf8 sequence is the start of a codepoint >= U+10000.
|
||||
return (code_unit >= 0xF0);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue