[dart2wasm] Optimize UTF8 decoder when decoding a native array

Intercept all `Utf8Decoder.convert` calls to check the input type and
call the right UTF8 decoder function.

This currently only optimizes `U8List` parsing, and only the fastest
path, where the input is an ASCII string. Follow-up CLs will optimize JS
typed array parsing (by calling the browser's UTF8 decoder) as well as
the slow paths for non-ASCII and malformed buffers.

Results from an internal benchmark extracted from a real use case:

Before:

    Parse(RunTime):     330,500 us.

After:

    Parse(RunTime):     90,310 us.  (-72%)

dart2js -O4 results of the same benchmark:

    Parse(RunTime):     234,250 us.

Change-Id: I470697a9eb516e1e031e9b865ddb29e8f25569bf
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/331187
Commit-Queue: Ömer Ağacan <omersa@google.com>
Reviewed-by: Jackson Gardner <jacksongardner@google.com>
This commit is contained in:
Ömer Sinan Ağacan 2023-10-23 18:36:52 +00:00 committed by Commit Queue
parent 0a436b7005
commit 405a03909c
2 changed files with 59 additions and 22 deletions

View file

@ -4,6 +4,7 @@
import "dart:_internal" show patch, POWERS_OF_TEN, unsafeCast;
import "dart:_string";
import "dart:_typed_data";
import "dart:typed_data" show Uint8List, Uint16List;
/// This patch library has no additional parts.
@ -27,7 +28,7 @@ class Utf8Decoder {
@patch
Converter<List<int>, T> fuse<T>(Converter<String, T> next) {
if (next is JsonDecoder) {
return new _JsonUtf8Decoder(
return _JsonUtf8Decoder(
(next as JsonDecoder)._reviver, this._allowMalformed)
as dynamic/*=Converter<List<int>, T>*/;
}
@ -35,11 +36,49 @@ class Utf8Decoder {
return super.fuse<T>(next);
}
/// Decodes the UTF-8 bytes of [codeUnits] between [start] and [end] by
/// handing them to `_Utf8Decoder._convertSingle` as a `U8List`.
///
/// A `U8List` input is passed through as-is. Any other `List<int>` is first
/// copied into a fresh `U8List`; while copying, elements outside `0..255`
/// are replaced with `0xFF` when [allowMalformed] is true, and otherwise
/// cause a [FormatException] that points at the offending index of the
/// original [codeUnits].
///
/// A `null` [end] means "up to `codeUnits.length`".
@patch
static String? _convertIntercepted(
bool allowMalformed, List<int> codeUnits, int start, int? end) {
// NOTE(review): this `return null;` looks like the pre-change body shown by
// the diff view; if it were live, everything below would be unreachable --
// confirm against the actual file.
return null; // This call was not intercepted.
// We always intercept the calls to make sure the standard library UTF8
// decoder is only passed `U8List`, so that array accesses will be
// monomorphic and inlined.
if (codeUnits is U8List) {
// NOTE(review): `_convertSingle` reports errors at
// `actualStart + _charOrIndex`, and `_charOrIndex` appears to index into
// `bytes` directly. Since `bytes` is `codeUnits` itself here, passing
// `start` as `actualStart` may double-count the offset (0 may be what is
// intended) -- confirm.
return _Utf8Decoder(allowMalformed)._convertSingle(
unsafeCast<U8List>(codeUnits), start, end, codeUnits, start);
} else {
// TODO(omersa): Check if `codeUnits` is a JS array and call browser UTF8
// decoder here.
//
// If we're passed a `List<int>` other than `U8List` or a JS typed array,
// it means the performance is not too important. So we convert the input
// to `U8List` to avoid shipping another UTF8 decoder.
//
// NOTE(review): `start` and `end` are not range-checked in this branch
// before the copy is sized and filled; confirm that callers validate them.
end ??= codeUnits.length;
final length = end - start;
final u8list = U8List(length);
final u8listData = u8list.data;
if (allowMalformed) {
int u8listIdx = 0;
for (int codeUnitsIdx = start; codeUnitsIdx < end; codeUnitsIdx += 1) {
int byte = codeUnits[codeUnitsIdx];
if (byte < 0 || byte > 255) {
// 0xFF is never part of well-formed UTF-8, so this presumably makes the
// decoder treat the position as malformed input.
byte = 0xFF;
}
u8listData.write(u8listIdx++, byte);
}
} else {
int u8listIdx = 0;
for (int codeUnitsIdx = start; codeUnitsIdx < end; codeUnitsIdx += 1) {
final byte = codeUnits[codeUnitsIdx];
if (byte < 0 || byte > 255) {
// Report the error against the caller's list and index, not the copy.
throw FormatException(
'Invalid UTF-8 byte', codeUnits, codeUnitsIdx);
}
u8listData.write(u8listIdx++, byte);
}
}
// The copy starts at index 0; the original list and `start` are passed along
// so that decoder errors can be reported relative to the caller's input.
return _Utf8Decoder(allowMalformed)
._convertSingle(u8list, 0, length, codeUnits, start);
}
}
}
@ -1364,7 +1403,7 @@ abstract class _ChunkedJsonParser<T> {
message = "Unexpected character";
if (position == chunkEnd) message = "Unexpected end of input";
}
throw new FormatException(message, chunk, position);
throw FormatException(message, chunk, position);
}
}
@ -1469,7 +1508,7 @@ class _JsonStringDecoderSink extends StringConversionSinkBase {
* Chunked JSON parser that parses UTF-8 chunks.
*/
class _JsonUtf8Parser extends _ChunkedJsonParser<List<int>> {
static final Uint8List emptyChunk = Uint8List(0);
static final U8List emptyChunk = U8List(0);
final _Utf8Decoder decoder;
List<int> chunk = emptyChunk;
@ -1674,20 +1713,14 @@ class _Utf8Decoder {
/// Not expected to be reached.
///
/// `Utf8Decoder._convertIntercepted` is supposed to handle every conversion
/// before this method is called, so getting here means a call was not
/// intercepted. The method never returns normally; it throws.
@patch
String convertSingle(List<int> codeUnits, int start, int? maybeEnd) {
int end = RangeError.checkValidRange(start, maybeEnd, codeUnits.length);
// `Utf8Decoder._convertIntercepted` should intercept all calls to call the
// right decoder for the `codeUnits` type.
throw 'Utf8Decoder.convert was not intercepted';
}
// Have bytes as Uint8List.
Uint8List bytes;
int errorOffset;
if (codeUnits is Uint8List) {
bytes = unsafeCast<Uint8List>(codeUnits);
errorOffset = 0;
} else {
bytes = _makeUint8List(codeUnits, start, end);
errorOffset = start;
end -= start;
start = 0;
}
String _convertSingle(U8List bytes, int start, int? maybeEnd,
List<int> actualSource, int actualStart) {
final int end = RangeError.checkValidRange(start, maybeEnd, bytes.length);
// Skip initial BOM.
start = skipBomSingle(bytes, start, end);
@ -1696,14 +1729,14 @@ class _Utf8Decoder {
if (start == end) return "";
// Scan input to determine size and appropriate decoder.
int size = scan(bytes, start, end);
int flags = _scanFlags;
final int size = scan(bytes, start, end);
final int flags = _scanFlags;
if (flags == 0) {
// Pure ASCII.
assert(size == end - start);
OneByteString result = OneByteString.withLength(size);
copyRangeFromUint8ListToOneByteString(bytes, result, start, 0, size);
oneByteStringArray(result).copy(0, bytes.data, start, size);
return result;
}
@ -1726,7 +1759,7 @@ class _Utf8Decoder {
_charOrIndex = end;
}
final String message = errorDescription(_state);
throw FormatException(message, codeUnits, errorOffset + _charOrIndex);
throw FormatException(message, actualSource, actualStart + _charOrIndex);
}
// Start over on slow path.

View file

@ -31,6 +31,10 @@ void writeIntoOneByteString(OneByteString s, int index, int codePoint) =>
void writeIntoTwoByteString(TwoByteString s, int index, int codePoint) =>
s._setAt(index, codePoint);
/// Returns the `_array` field of [s].
///
/// Top-level accessor for `OneByteString._array`, provided so that `_array`
/// itself does not have to be made public.
@pragma('wasm:prefer-inline')
WasmIntArray<WasmI8> oneByteStringArray(OneByteString s) => s._array;
/// The [fromStart] and [toStart] indices together with the [length] must
/// specify ranges within the bounds of the list / string.
void copyRangeFromUint8ListToOneByteString(