2013-07-15 10:26:38 +00:00
|
|
|
|
// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
|
|
|
|
|
// for details. All rights reserved. Use of this source code is governed by a
|
|
|
|
|
// BSD-style license that can be found in the LICENSE file.
|
|
|
|
|
|
2019-11-15 00:22:47 +00:00
|
|
|
|
// @dart = 2.6
|
|
|
|
|
|
2017-07-07 18:18:20 +00:00
|
|
|
|
part of dart.convert;
|
2013-07-15 10:26:38 +00:00
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// The Unicode Replacement character `U+FFFD` (<28>).
|
2017-11-14 12:59:14 +00:00
|
|
|
|
// Written to the output by the decoder in this file wherever malformed input
// is replaced rather than rejected.
const int unicodeReplacementCharacterRune = 0xFFFD;
|
2013-08-26 10:37:25 +00:00
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// The Unicode Byte Order Marker (BOM) character `U+FEFF`.
|
2017-11-14 12:59:14 +00:00
|
|
|
|
// The decoder in this file compares the first decoded character against this
// value and does not emit it when they match.
const int unicodeBomCharacterRune = 0xFEFF;
|
2013-10-01 12:11:02 +00:00
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// An instance of the default implementation of the [Utf8Codec].
|
|
|
|
|
///
|
|
|
|
|
/// This instance provides a convenient access to the most common UTF-8
|
|
|
|
|
/// use cases.
|
|
|
|
|
///
|
|
|
|
|
/// Examples:
|
|
|
|
|
///
|
|
|
|
|
/// var encoded = utf8.encode("Îñţérñåţîöñåļîžåţîờñ");
|
|
|
|
|
/// var decoded = utf8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
|
|
|
|
|
/// 0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
|
|
|
|
|
// Built with the constructor's default argument, i.e. `allowMalformed: false`.
const Utf8Codec utf8 = Utf8Codec();
|
|
|
|
|
|
|
|
|
|
/// A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes
|
|
|
|
|
/// UTF-8 code units to strings.
|
2013-08-26 10:37:25 +00:00
|
|
|
|
class Utf8Codec extends Encoding {
|
2013-07-22 12:25:16 +00:00
|
|
|
|
final bool _allowMalformed;
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Instantiates a new [Utf8Codec].
|
|
|
|
|
///
|
|
|
|
|
/// The optional [allowMalformed] argument defines how [decoder] (and [decode])
|
|
|
|
|
/// deal with invalid or unterminated character sequences.
|
|
|
|
|
///
|
|
|
|
|
/// If it is `true` (and not overridden at the method invocation) [decode] and
|
|
|
|
|
/// the [decoder] replace invalid (or unterminated) octet
|
|
|
|
|
/// sequences with the Unicode Replacement character `U+FFFD` (<28>). Otherwise
|
|
|
|
|
/// they throw a [FormatException].
|
|
|
|
|
const Utf8Codec({bool allowMalformed = false})
|
2013-07-22 12:25:16 +00:00
|
|
|
|
: _allowMalformed = allowMalformed;
|
|
|
|
|
|
Fix various Encoding documentation. Fixes #31827, #31828, #31829, #31830, #31831
1. Override documentation for Codec.decoder and Codec.encoder so that
mysterious type parameters `T` and `S` don't appear in documentation, e.g.
https://api.dartlang.org/stable/2.0.0/dart-convert/Encoding/decoder.html.
This fixes the documentation for subclasses, including AsciiCodec,
Latin1Codec, Utf8Codec, and SystemEncoding.
2. Fix typo in `const AsciiCodec()`.
3. Override documentation for AsciiCodec.name.
4. Override documentation for Latin1Codec.name.
5. Override documentation for Utf8Codec.name.
6. Document `const SystemEncoding()`.
Bug: https://github.com/dart-lang/sdk/issues/31827, https://github.com/dart-lang/sdk/issues/31828, https://github.com/dart-lang/sdk/issues/31829, https://github.com/dart-lang/sdk/issues/31830, https://github.com/dart-lang/sdk/issues/31831
Change-Id: I292e1cbafff991b55ab67dba684692b686f3b8de
Reviewed-on: https://dart-review.googlesource.com/c/82788
Reviewed-by: Lasse R.H. Nielsen <lrn@google.com>
Commit-Queue: Samuel Rawlins <srawlins@google.com>
2018-11-03 14:59:18 +00:00
|
|
|
|
/// The name of this codec, "utf-8".
|
2013-08-26 10:37:25 +00:00
|
|
|
|
String get name => "utf-8";
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
|
|
|
|
|
/// corresponding string.
|
|
|
|
|
///
|
|
|
|
|
/// If the [codeUnits] start with the encoding of a
|
|
|
|
|
/// [unicodeBomCharacterRune], that character is discarded.
|
|
|
|
|
///
|
|
|
|
|
/// If [allowMalformed] is `true` the decoder replaces invalid (or
|
|
|
|
|
/// unterminated) character sequences with the Unicode Replacement character
|
|
|
|
|
/// `U+FFFD` (<28>). Otherwise it throws a [FormatException].
|
|
|
|
|
///
|
|
|
|
|
/// If [allowMalformed] is not given, it defaults to the `allowMalformed` that
|
|
|
|
|
/// was used to instantiate `this`.
|
2017-03-16 17:22:19 +00:00
|
|
|
|
String decode(List<int> codeUnits, {bool allowMalformed}) {
|
2018-10-26 23:05:24 +00:00
|
|
|
|
allowMalformed ??= _allowMalformed;
|
|
|
|
|
return Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
|
2013-07-22 12:25:16 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-08-29 20:44:28 +00:00
|
|
|
|
Utf8Encoder get encoder => const Utf8Encoder();
|
2014-11-20 11:23:51 +00:00
|
|
|
|
Utf8Decoder get decoder {
|
2018-10-26 23:05:24 +00:00
|
|
|
|
return Utf8Decoder(allowMalformed: _allowMalformed);
|
2013-07-22 12:25:16 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Converter from strings to their UTF-8 code units (a list of unsigned 8-bit
/// integers).
class Utf8Encoder extends Converter<String, List<int>> {
  const Utf8Encoder();

  /// Encodes [string] as UTF-8 and returns the code units (a list of unsigned
  /// 8-bit integers).
  ///
  /// When [start] and [end] are given, only `string.substring(start, end)` is
  /// encoded.
  Uint8List convert(String string, [int start = 0, int end]) {
    end = RangeError.checkValidRange(start, end, string.length);
    final unitCount = end - start;
    if (unitCount == 0) {
      return Uint8List(0);
    }

    // Size the scratch buffer for the worst case: one UTF-16 code unit needs
    // at most 3 bytes, and a surrogate pair (two units) needs 4.
    final worker = _Utf8Encoder.withBufferSize(unitCount * 3);
    final consumedUpTo = worker._fillBuffer(string, start, end);
    assert(consumedUpTo >= end - 1);

    if (consumedUpTo != end) {
      // The bulk pass left exactly one code unit behind. It only does that
      // for a lead surrogate in final position, so encode it on its own.
      final trailingUnit = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(trailingUnit));
      // 0 is not a tail surrogate, so only the lone lead surrogate is written.
      final wasCombined = worker._writeSurrogate(trailingUnit, 0);
      assert(!wasCombined);
    }

    return worker._buffer.sublist(0, worker._bufferIndex);
  }

  /// Starts a chunked conversion.
  ///
  /// A [sink] that already is a [ByteConversionSink] is used directly; any
  /// other sink is first wrapped with [ByteConversionSink.from].
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    final ByteConversionSink byteSink =
        sink is ByteConversionSink ? sink : ByteConversionSink.from(sink);
    return _Utf8EncoderSink(byteSink);
  }

  // Overridden only to give callers a more precise static type.
  Stream<List<int>> bind(Stream<String> stream) {
    return super.bind(stream);
  }
}
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Encodes strings into UTF-8 code units (unsigned 8 bit integers), writing
/// them into a fixed-size byte buffer.
// TODO(floitsch): make this class public.
class _Utf8Encoder {
  /// A lead surrogate held back from a previous slice, or 0 when there is
  /// none. Only the chunked sink subclass stores a value here.
  int _carry = 0;

  /// Position in [_buffer] where the next byte will be written.
  int _bufferIndex = 0;

  /// Output buffer; its size is fixed at construction.
  final Uint8List _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  _Utf8Encoder.withBufferSize(int bufferSize)
      : _buffer = _createBuffer(bufferSize);

  /// Allocation hook so an implementation can choose the most efficient byte
  /// storage.
  static Uint8List _createBuffer(int size) {
    return Uint8List(size);
  }

  /// Writes [leadingSurrogate] to [_buffer], merging it with [nextCodeUnit]
  /// when that unit is a tail surrogate.
  ///
  /// Returns `true` when the two units were merged into a single four-byte
  /// sequence. Returns `false` when [nextCodeUnit] is not a tail surrogate; in
  /// that case only [leadingSurrogate] was written (as three bytes) and
  /// [nextCodeUnit] still has to be encoded by the caller.
  ///
  /// Passing 0 for [nextCodeUnit] is a safe way to write just the leading
  /// surrogate.
  ///
  /// The caller must make sure [_buffer] has room for the bytes written.
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (!_isTailSurrogate(nextCodeUnit)) {
      // TODO(floitsch): allow to throw on malformed strings.
      // The input was invalid UTF-16, so the unpaired surrogate is encoded
      // directly. The result is invalid UTF-8. A surrogate always takes
      // 3 bytes.
      _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);
      _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);
      return false;
    }

    final codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
    // Anything that needs two UTF-16 code units needs four UTF-8 bytes.
    assert(codePoint > _THREE_BYTE_LIMIT);
    assert(codePoint <= _FOUR_BYTE_LIMIT);
    _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
    return true;
  }

  /// Encodes as much of `str[start..end)` into [_buffer] as fits.
  ///
  /// A lead surrogate sitting in the last position of the range is never
  /// encoded here; the caller has to deal with it.
  ///
  /// Returns the index of the first code unit of [str] that has not been
  /// encoded.
  int _fillBuffer(String str, int start, int end) {
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      // Leave a trailing lead surrogate for the caller. This also guarantees
      // that `index + 1` below is always a valid position.
      end--;
    }

    var index = start;
    while (index < end) {
      final unit = str.codeUnitAt(index);
      if (unit <= _ONE_BYTE_LIMIT) {
        // ASCII is represented identically in UTF-8 and UTF-16.
        if (_bufferIndex >= _buffer.length) break;
        _buffer[_bufferIndex++] = unit;
      } else if (_isLeadSurrogate(unit)) {
        // Reserve four bytes, the size of a combined surrogate pair.
        if (_bufferIndex + 3 >= _buffer.length) break;
        final following = str.codeUnitAt(index + 1);
        if (_writeSurrogate(unit, following)) {
          // The following unit was consumed as part of the pair.
          index++;
        }
      } else if (unit <= _TWO_BYTE_LIMIT) {
        if (_bufferIndex + 1 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      } else {
        assert(unit <= _THREE_BYTE_LIMIT);
        if (_bufferIndex + 2 >= _buffer.length) break;
        _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
        _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      }
      index++;
    }
    return index;
  }
}
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Encodes strings that arrive in chunks into UTF-8 code units (unsigned
/// 8-bit integers) and forwards the bytes to a [ByteConversionSink].
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
  /// Destination for the encoded bytes.
  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  /// Flushes a pending lead surrogate, if any, and closes the target sink.
  void close() {
    if (_carry == 0) {
      _sink.close();
      return;
    }
    // A lead surrogate is still pending. Flushing it through addSlice ends up
    // calling close again, but by then the carry has been reset to 0.
    addSlice("", 0, 0, true);
  }

  /// Encodes `str[start..end)` and passes the bytes on to the target sink,
  /// one buffer-load at a time.
  ///
  /// A lead surrogate at the end of a slice is kept in [_carry] so that it
  /// can be paired with the first code unit of the next slice. When [isLast]
  /// is `true`, [close] is called after the slice has been processed.
  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    final sliceIsEmpty = start == end;
    if (sliceIsEmpty && !isLast) {
      return;
    }

    if (_carry != 0) {
      int following;
      if (start == end) {
        // Nothing left to pair with: 0 makes _writeSurrogate emit the carry
        // by itself.
        assert(isLast);
        following = 0;
      } else {
        following = str.codeUnitAt(start);
      }
      final merged = _writeSurrogate(_carry, following);
      // A merge can only have happened if the slice had a first code unit.
      assert(!merged || start != end);
      if (merged) {
        start++;
      }
      _carry = 0;
    }

    do {
      start = _fillBuffer(str, start, end);
      final isLastSlice = isLast && (start == end);

      if (start == end - 1 && _isLeadSurrogate(str.codeUnitAt(start))) {
        final roomForLoneSurrogate = _bufferIndex < _buffer.length - 3;
        if (!isLast || !roomForLoneSurrogate) {
          // Hold the surrogate back. If isLast is true, close will flush it.
          _carry = str.codeUnitAt(start);
        } else {
          // Final slice and the buffer still has room: write the unpaired
          // surrogate now. 0 is not a surrogate, so nothing gets combined.
          final hasBeenCombined = _writeSurrogate(str.codeUnitAt(start), 0);
          assert(!hasBeenCombined);
        }
        start++;
      }

      _sink.addSlice(_buffer, 0, _bufferIndex, isLastSlice);
      _bufferIndex = 0;
    } while (start < end);

    if (isLast) {
      close();
    }
  }

  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
  // needs to deal with malformed input.
}
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// This class converts UTF-8 code units (lists of unsigned 8-bit integers)
|
|
|
|
|
/// to a string.
|
2017-05-01 13:20:26 +00:00
|
|
|
|
class Utf8Decoder extends Converter<List<int>, String> {
|
2013-07-17 11:35:12 +00:00
|
|
|
|
final bool _allowMalformed;
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Instantiates a new [Utf8Decoder].
|
|
|
|
|
///
|
|
|
|
|
/// The optional [allowMalformed] argument defines how [convert] deals
|
|
|
|
|
/// with invalid or unterminated character sequences.
|
|
|
|
|
///
|
|
|
|
|
/// If it is `true` [convert] replaces invalid (or unterminated) character
|
|
|
|
|
/// sequences with the Unicode Replacement character `U+FFFD` (<28>). Otherwise
|
|
|
|
|
/// it throws a [FormatException].
|
|
|
|
|
const Utf8Decoder({bool allowMalformed = false})
|
|
|
|
|
: _allowMalformed = allowMalformed;
|
|
|
|
|
|
|
|
|
|
/// Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
|
|
|
|
|
/// corresponding string.
|
|
|
|
|
///
|
|
|
|
|
/// Uses the code units from [start] to, but no including, [end].
|
|
|
|
|
/// If [end] is omitted, it defaults to `codeUnits.length`.
|
|
|
|
|
///
|
|
|
|
|
/// If the [codeUnits] start with the encoding of a
|
|
|
|
|
/// [unicodeBomCharacterRune], that character is discarded.
|
2014-11-20 11:23:51 +00:00
|
|
|
|
String convert(List<int> codeUnits, [int start = 0, int end]) {
|
2015-01-21 01:29:14 +00:00
|
|
|
|
// Allow the implementation to intercept and specialize based on the type
|
|
|
|
|
// of codeUnits.
|
2018-10-26 23:05:24 +00:00
|
|
|
|
var result = _convertIntercepted(_allowMalformed, codeUnits, start, end);
|
2015-01-21 01:29:14 +00:00
|
|
|
|
if (result != null) {
|
2015-11-17 02:29:44 +00:00
|
|
|
|
return result;
|
2015-01-21 01:29:14 +00:00
|
|
|
|
}
|
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
var length = codeUnits.length;
|
|
|
|
|
end = RangeError.checkValidRange(start, end, length);
|
2019-02-20 17:19:14 +00:00
|
|
|
|
|
|
|
|
|
// Fast case for ASCII strings avoids StringBuffer/_Utf8Decoder.
|
|
|
|
|
int oneBytes = _scanOneByteCharacters(codeUnits, start, end);
|
|
|
|
|
StringBuffer buffer;
|
|
|
|
|
bool isFirstCharacter = true;
|
|
|
|
|
if (oneBytes > 0) {
|
|
|
|
|
var firstPart = String.fromCharCodes(codeUnits, start, start + oneBytes);
|
|
|
|
|
start += oneBytes;
|
|
|
|
|
if (start == end) {
|
|
|
|
|
return firstPart;
|
|
|
|
|
}
|
|
|
|
|
buffer = StringBuffer(firstPart);
|
|
|
|
|
isFirstCharacter = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
buffer ??= StringBuffer();
|
2018-10-26 23:05:24 +00:00
|
|
|
|
var decoder = _Utf8Decoder(buffer, _allowMalformed);
|
2019-02-20 17:19:14 +00:00
|
|
|
|
decoder._isFirstCharacter = isFirstCharacter;
|
2014-11-20 11:23:51 +00:00
|
|
|
|
decoder.convert(codeUnits, start, end);
|
2017-01-23 13:18:40 +00:00
|
|
|
|
decoder.flush(codeUnits, end);
|
2013-07-17 11:35:12 +00:00
|
|
|
|
return buffer.toString();
|
|
|
|
|
}
|
2013-07-25 13:26:09 +00:00
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Starts a chunked conversion.
|
|
|
|
|
///
|
|
|
|
|
/// The converter works more efficiently if the given [sink] is a
|
|
|
|
|
/// [StringConversionSink].
|
2014-03-20 10:13:01 +00:00
|
|
|
|
ByteConversionSink startChunkedConversion(Sink<String> sink) {
|
2013-07-25 13:26:09 +00:00
|
|
|
|
StringConversionSink stringSink;
|
|
|
|
|
if (sink is StringConversionSink) {
|
|
|
|
|
stringSink = sink;
|
|
|
|
|
} else {
|
2018-10-26 23:05:24 +00:00
|
|
|
|
stringSink = StringConversionSink.from(sink);
|
2013-07-25 13:26:09 +00:00
|
|
|
|
}
|
|
|
|
|
return stringSink.asUtf8Sink(_allowMalformed);
|
|
|
|
|
}
|
2013-07-26 10:35:42 +00:00
|
|
|
|
|
|
|
|
|
// Override the base-classes bind, to provide a better type.
|
|
|
|
|
Stream<String> bind(Stream<List<int>> stream) => super.bind(stream);
|
2014-11-04 10:25:41 +00:00
|
|
|
|
|
2017-01-03 11:05:40 +00:00
|
|
|
|
external Converter<List<int>, T> fuse<T>(Converter<String, T> next);
|
2015-01-21 01:29:14 +00:00
|
|
|
|
|
|
|
|
|
external static String _convertIntercepted(
|
|
|
|
|
bool allowMalformed, List<int> codeUnits, int start, int end);
|
2013-07-17 11:35:12 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// UTF-8 constants: the largest value that fits into an encoding of the given
// number of bytes.
const int _ONE_BYTE_LIMIT = 0x7F; // 7 bits
const int _TWO_BYTE_LIMIT = 0x7FF; // 11 bits
const int _THREE_BYTE_LIMIT = 0xFFFF; // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10FFFF; // 21 bits, truncated to Unicode max.
|
2013-07-17 11:35:12 +00:00
|
|
|
|
|
|
|
|
|
// UTF-16 constants.
// Selects the six tag bits that distinguish lead from tail surrogates.
const int _SURROGATE_TAG_MASK = 0xFC00;
// Selects the ten payload bits carried by each surrogate.
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

/// Whether [codeUnit] carries the lead (high) surrogate tag bits.
bool _isLeadSurrogate(int codeUnit) {
  return _LEAD_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK);
}

/// Whether [codeUnit] carries the tail (low) surrogate tag bits.
bool _isTailSurrogate(int codeUnit) {
  return _TAIL_SURROGATE_MIN == (codeUnit & _SURROGATE_TAG_MASK);
}

/// Combines the payload bits of [lead] and [tail] into a single code point at
/// or above 0x10000.
///
/// The tag bits of both arguments are masked off, so the arguments are not
/// checked for actually being surrogates.
int _combineSurrogatePair(int lead, int tail) {
  final highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
  final lowBits = tail & _SURROGATE_VALUE_MASK;
  // The low ten bits of the sum are always zero, so OR-ing in the tail
  // payload is the same as adding it.
  return (0x10000 + highBits) | lowBits;
}
|
2013-07-17 11:35:12 +00:00
|
|
|
|
|
2018-10-26 23:05:24 +00:00
|
|
|
|
/// Decodes UTF-8.
///
/// The decoder handles chunked input: a multi-byte sequence that is cut off
/// at the end of one [convert] call is remembered and completed by the next
/// call. Decoded characters are written to the [StringSink] given at
/// construction.
// TODO(floitsch): make this class public.
class _Utf8Decoder {
  /// Whether malformed input is replaced with U+FFFD (`true`) or reported by
  /// throwing a [FormatException] (`false`).
  final bool _allowMalformed;

  /// Receives the decoded characters.
  final StringSink _stringSink;

  /// True until the first character has been produced. While it is true, a
  /// decoded [unicodeBomCharacterRune] is dropped instead of written.
  bool _isFirstCharacter = true;

  /// Bits collected so far for a sequence that spans two [convert] calls.
  int _value = 0;

  /// Number of continuation bytes still missing from the pending sequence.
  int _expectedUnits = 0;

  /// Total number of continuation bytes of the pending sequence. Used to
  /// index [_LIMITS] and to compute error offsets.
  int _extraUnits = 0;

  _Utf8Decoder(this._stringSink, this._allowMalformed);

  /// Whether a multi-byte sequence has been started but not yet completed.
  bool get hasPartialInput => _expectedUnits > 0;

  // Limits of one through four byte encodings.
  static const List<int> _LIMITS = <int>[
    _ONE_BYTE_LIMIT,
    _TWO_BYTE_LIMIT,
    _THREE_BYTE_LIMIT,
    _FOUR_BYTE_LIMIT
  ];

  /// Ends the conversion; equivalent to calling [flush] without arguments.
  void close() {
    flush();
  }

  /// Flushes this decoder as if closed.
  ///
  /// This method throws a [FormatException] if the input was partial and the
  /// decoder was constructed with `allowMalformed` set to `false`. Otherwise
  /// a pending partial sequence is replaced by a single U+FFFD and the
  /// pending state is cleared.
  ///
  /// The [source] and [offset] of the current position may be provided,
  /// and are included in the exception if one is thrown.
  void flush([List<int> source, int offset]) {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw FormatException(
            "Unfinished UTF-8 octet sequence", source, offset);
      }
      _stringSink.writeCharCode(unicodeReplacementCharacterRune);
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /// Decodes `codeUnits[startIndex..endIndex)` and writes the resulting
  /// characters to the string sink.
  ///
  /// A sequence left unfinished by a previous call is continued first. A
  /// sequence left unfinished by this call is stored for the next call (or
  /// for [flush]).
  ///
  /// Throws a [FormatException] on malformed input unless the decoder allows
  /// malformed input, in which case U+FFFD is written instead.
  void convert(List<int> codeUnits, int startIndex, int endIndex) {
    // Work on local copies of the pending state and clear the fields. They
    // are written back at the end only if a sequence is still unfinished.
    var value = _value;
    var expectedUnits = _expectedUnits;
    var extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    var i = startIndex;
    loop:
    while (true) {
      multibyte:
      if (expectedUnits > 0) {
        // Collect the continuation bytes of the current sequence.
        do {
          if (i == endIndex) {
            // Input exhausted in the middle of a sequence.
            break loop;
          }
          var unit = codeUnits[i];
          if ((unit & 0xC0) != 0x80) {
            // Not a continuation byte. `i` is deliberately not advanced: the
            // unit is looked at again below as the start of something new.
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
                  codeUnits,
                  i);
            }
            _isFirstCharacter = false;
            _stringSink.writeCharCode(unicodeReplacementCharacterRune);
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overly long encoding. The value could be encoded with a shorter
          // encoding.
          if (!_allowMalformed) {
            throw FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}",
                codeUnits,
                i - extraUnits - 1);
          }
          expectedUnits = extraUnits = 0;
          value = unicodeReplacementCharacterRune;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw FormatException(
                "Character outside valid Unicode range: "
                "0x${value.toRadixString(16)}",
                codeUnits,
                i - extraUnits - 1);
          }
          value = unicodeReplacementCharacterRune;
        }
        // Drop a BOM only when it is the very first character.
        if (!_isFirstCharacter || value != unicodeBomCharacterRune) {
          _stringSink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      while (i < endIndex) {
        var oneBytes = _scanOneByteCharacters(codeUnits, i, endIndex);
        if (oneBytes > 0) {
          _isFirstCharacter = false;
          assert(i + oneBytes <= endIndex);
          _stringSink.write(String.fromCharCodes(codeUnits, i, i + oneBytes));

          i += oneBytes;
          if (i == endIndex) break;
        }
        var unit = codeUnits[i++];
        // TODO(floitsch): the way we test we could potentially allow
        // units that are too large, if they happen to have the
        // right bit-pattern. (Same is true for the multibyte loop above).
        // TODO(floitsch): optimize this loop. See:
        // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
        if (unit < 0) {
          // TODO(floitsch): should this be unit <= 0 ?
          if (!_allowMalformed) {
            throw FormatException(
                "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}",
                codeUnits,
                i - 1);
          }
          // The replacement character counts as output, so a BOM decoded
          // after it is no longer the first character and must be kept.
          // Every other replacement path clears the flag as well.
          _isFirstCharacter = false;
          _stringSink.writeCharCode(unicodeReplacementCharacterRune);
        } else {
          assert(unit > _ONE_BYTE_LIMIT);
          if ((unit & 0xE0) == 0xC0) {
            // Lead byte of a two-byte sequence.
            value = unit & 0x1F;
            expectedUnits = extraUnits = 1;
            continue loop;
          }
          if ((unit & 0xF0) == 0xE0) {
            // Lead byte of a three-byte sequence.
            value = unit & 0x0F;
            expectedUnits = extraUnits = 2;
            continue loop;
          }
          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
            // Lead byte of a four-byte sequence.
            value = unit & 0x07;
            expectedUnits = extraUnits = 3;
            continue loop;
          }
          // Neither ASCII nor a valid lead byte.
          if (!_allowMalformed) {
            throw FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}",
                codeUnits,
                i - 1);
          }
          value = unicodeReplacementCharacterRune;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          _stringSink.writeCharCode(value);
        }
      }
      break loop;
    }
    if (expectedUnits > 0) {
      // Remember the unfinished sequence for the next call or for flush.
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}
|
2018-11-28 10:43:32 +00:00
|
|
|
|
|
|
|
|
|
// Returns the number of bytes in [units] starting at offset [from] which have
|
|
|
|
|
// the leftmost bit set to 0.
|
|
|
|
|
//
|
|
|
|
|
// To increase performance of this critical method we have a special variant of
|
|
|
|
|
// it implemented in the VM's patch files, which is why we make it external.
|
|
|
|
|
// NOTE(review): callers in this file pass an exclusive end index as
// [endIndex] and assert `from + result <= endIndex`; that bound is enforced
// by the external implementation, which is not visible here — confirm there.
external int _scanOneByteCharacters(List<int> units, int from, int endIndex);
|