dart-sdk/runtime/vm/unicode.cc
Alexander Markov 17d6ba15b6 [vm] Remove external strings
This change removes support for external strings from the VM along with
Dart_NewExternalLatin1String, Dart_NewExternalUTF16String and
Dart_IsExternalString Dart C API functions.

External strings are not used by the VM nor any known embedder, but
Dart VM was paying the maintenance and performance price for
the external string implementation classes.

TEST=ci

Change-Id: I094cd2d2b7ec0840e9f09e1ca9e5a7acd4e78c28
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/358760
Reviewed-by: Ryan Macnak <rmacnak@google.com>
Reviewed-by: Siva Annamalai <asiva@google.com>
Commit-Queue: Alexander Markov <alexmarkov@google.com>
2024-03-20 20:08:13 +00:00

131 lines
4.5 KiB
C++

// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
#include "platform/unicode.h"
#include "vm/allocation.h"
#include "vm/globals.h"
#include "vm/object.h"
namespace dart {
// Word-sized mask with the top bit of every byte set. And-ing a word of
// 1-byte string data with it yields zero exactly when every byte in that word
// is ASCII (i.e. below 0x80, so no Latin1-only characters are present).
#if !defined(ARCH_IS_64_BIT)
static constexpr uintptr_t kAsciiWordMask = 0x80808080u;
#else
static constexpr uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);
#endif
// Returns the number of bytes the UTF-8 encoding of |str| occupies.
intptr_t Utf8::Length(const String& str) {
  if (!str.IsOneByteString()) {
    // General path for 2-byte strings: walk the code points (so surrogate
    // pairs are combined) and sum the encoded size of each one.
    intptr_t total = 0;
    String::CodePointIterator code_points(str);
    while (code_points.Next()) {
      const int32_t code_point = code_points.Current();
      total += Utf8::Length(code_point);
    }
    return total;
  }

  // Fast path for 1-byte strings. A code point below 0x80 encodes to one byte
  // and anything from 0x80 up encodes to two, so the result is the character
  // count plus the number of bytes that have their top bit set.
  constexpr uintptr_t kWordBytes = sizeof(uintptr_t);
  const uintptr_t num_chars = str.Length();
  uintptr_t utf8_size = num_chars;
  NoSafepointScope no_safepoint;
  const uintptr_t* words =
      reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str));

  // Scan every complete word of character data.
  const uintptr_t whole_words = num_chars / kWordBytes;
  for (uintptr_t w = 0; w < whole_words; w++) {
    uintptr_t high_bits = words[w] & kAsciiWordMask;
    if (high_bits == 0) continue;  // Entire word is ASCII.
    // Every byte lane now holds either 0x80 or 0. Fold the upper half onto
    // the lower half repeatedly so the per-lane counts accumulate in the
    // lowest lane, starting at bit 7. A word has at most 8 lanes, so the
    // count always fits in four bits.
#if defined(ARCH_IS_64_BIT)
    high_bits += high_bits >> 32;
#endif
    high_bits += high_bits >> 16;
    high_bits += high_bits >> 8;
    utf8_size += (high_bits >> 7) & 0xf;
  }

  // Handle the remaining (num_chars % word size) characters one at a time.
  for (uintptr_t i = whole_words * kWordBytes; i < num_chars; i++) {
    if (str.CharAt(i) > kMaxOneByteChar) utf8_size++;
  }
  return utf8_size;
}
// Encodes |src| as UTF-8 into |dst|, writing at most |len| bytes.
//
// Returns the number of bytes written to |dst|. Encoding stops before the
// first character whose encoding would not fit in the remaining space, so a
// character is never written partially. No terminating NUL is written.
//
// Debug builds assert that |len| is at least Length(src).
intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
  uintptr_t array_len = len;
  intptr_t pos = 0;
  ASSERT(static_cast<intptr_t>(array_len) >= Length(src));
  if (src.IsOneByteString()) {
    // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
    // encodings and all >= 0x80 have two-byte encodings.
    NoSafepointScope scope;
    const uintptr_t* data =
        reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src));
    uintptr_t char_length = src.Length();
    // Output cursor for this branch. It is unsigned so it compares cleanly
    // against |array_len|, and it must NOT be named |pos|: a local with that
    // name would shadow the function-level result and make the final return
    // report 0 bytes written.
    uintptr_t out = 0;
    ASSERT(kMaxOneByteChar + 1 == 0x80);
    for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
      // Read the input one word at a time and just write it verbatim if it is
      // plain ASCII, as determined by the mask. The bounds check comes first
      // so that |*data| is only dereferenced when a full word of input
      // remains.
      if (i + sizeof(uintptr_t) <= char_length &&
          (*data & kAsciiWordMask) == 0 &&
          out + sizeof(uintptr_t) <= array_len) {
        StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + out), *data);
        out += sizeof(uintptr_t);
      } else {
        // Process up to one word of input byte by byte: it either contains
        // non-ASCII Latin1 characters, is the partial word at the end of the
        // string, or does not fit into the output as a whole word.
        const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
        const uint8_t* limit =
            Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i));
        for (; p < limit; p++) {
          uint8_t c = *p;
          // These calls to Length and Encode get inlined and the cases for 3
          // and 4 byte sequences are removed.
          intptr_t bytes = Length(c);
          if (out + bytes > array_len) {
            // Output buffer is full; report what has been written so far.
            return out;
          }
          Encode(c, dst + out);
          out += bytes;
        }
      }
      data++;
    }
    // Publish the number of bytes written to the function-level result.
    pos = static_cast<intptr_t>(out);
  } else {
    // For two-byte strings, which can contain 3 and 4-byte UTF-8 encodings,
    // which can result in surrogate pairs, use the more general code.
    String::CodePointIterator it(src);
    while (it.Next()) {
      int32_t ch = it.Current();
      ASSERT(!Utf::IsOutOfRange(ch));
      if (Utf16::IsSurrogate(ch)) {
        // Encode unpaired surrogates as replacement characters to ensure the
        // output is valid UTF-8. Encoded size is the same (3), so the computed
        // length is still valid.
        ch = Utf::kReplacementChar;
      }
      intptr_t num_bytes = Utf8::Length(ch);
      if (pos + num_bytes > len) {
        // Next code point does not fit; stop without writing a partial one.
        break;
      }
      Utf8::Encode(ch, &dst[pos]);
      pos += num_bytes;
    }
  }
  return pos;
}
} // namespace dart