Replace old utf8 decoder with new one.

R=lrn@google.com

Review URL: https://codereview.chromium.org//19187002

git-svn-id: https://dart.googlecode.com/svn/branches/bleeding_edge/dart@25089 260f80e4-7a28-3924-810f-c04153c831b5
This commit is contained in:
floitsch@google.com 2013-07-17 11:35:12 +00:00
parent 912b741073
commit d0768e543f
5 changed files with 441 additions and 7 deletions

View file

@ -20,11 +20,41 @@ const UTF8 = const Utf8Codec();
* A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes
* UTF-8 code units to strings.
*/
class Utf8Codec extends Encoding {
  // Default for the `allowMalformed` behavior of [decode] and [decoder].
  final bool _allowMalformed;

  /**
   * Instantiates a new [Utf8Codec].
   *
   * The optional [allowMalformed] argument defines how [decoder] (and [decode])
   * deal with invalid or unterminated character sequences.
   *
   * If it is `true` (and not overridden at the method invocation) [decode] and
   * the [decoder] replace invalid (or unterminated) octet
   * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
   * they throw a [FormatException].
   */
  const Utf8Codec({ bool allowMalformed: false })
      : _allowMalformed = allowMalformed;

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
   * corresponding string.
   *
   * If [allowMalformed] is `true` the decoder replaces invalid (or
   * unterminated) character sequences with the Unicode Replacement character
   * `U+FFFD` (�). Otherwise it throws a [FormatException].
   *
   * If [allowMalformed] is not given, it defaults to the `allowMalformed` that
   * was used to instantiate `this`.
   */
  String decode(List<int> codeUnits, { bool allowMalformed }) {
    // An omitted argument arrives as `null`; fall back to the codec's setting.
    if (allowMalformed == null) allowMalformed = _allowMalformed;
    return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
  }

  /**
   * Returns a new [Utf8Encoder].
   */
  Converter<String, List<int>> get encoder => new Utf8Encoder();

  /**
   * Returns a new [Utf8Decoder] that uses the `allowMalformed` setting this
   * codec was instantiated with.
   */
  Converter<List<int>, String> get decoder {
    return new Utf8Decoder(allowMalformed: _allowMalformed);
  }
}

View file

@ -21,11 +21,187 @@ class Utf8Encoder extends Converter<String, List<int>> {
* to a string.
*/
class Utf8Decoder extends Converter<List<int>, String> {
  // Whether [convert] replaces malformed input instead of throwing.
  final bool _allowMalformed;

  /**
   * Instantiates a new [Utf8Decoder].
   *
   * The optional [allowMalformed] argument defines how [convert] deals
   * with invalid or unterminated character sequences.
   *
   * If it is `true` [convert] replaces invalid (or unterminated) character
   * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
   * it throws a [FormatException].
   */
  Utf8Decoder({ bool allowMalformed: false })
      : this._allowMalformed = allowMalformed;

  /**
   * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
   * corresponding string.
   *
   * The whole list is decoded as one complete input: a sequence that is still
   * unfinished at the end of [codeUnits] is treated as malformed.
   */
  // TODO(floitsch): allow to configure the decoder (for example the replacement
  // character).
  String convert(List<int> codeUnits) {
    StringBuffer buffer = new StringBuffer();
    _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
    decoder.convert(codeUnits, 0, codeUnits.length, buffer);
    // Reports (or replaces) a multi-byte sequence cut off by the end of input.
    decoder.close(buffer);
    return buffer.toString();
  }
}
// UTF-8 constants: the largest code point that fits in an encoding of the
// given length (the comment gives the number of payload bits).
const int _ONE_BYTE_LIMIT = 0x7f;   // 7 bits
const int _TWO_BYTE_LIMIT = 0x7ff;  // 11 bits
const int _THREE_BYTE_LIMIT = 0xffff;  // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10ffff;  // 21 bits, truncated to Unicode max.

// UTF-16 constants.
const int _SURROGATE_MASK = 0xF800;
const int _SURROGATE_TAG_MASK = 0xFC00;
const int _SURROGATE_VALUE_MASK = 0x3FF;
const int _LEAD_SURROGATE_MIN = 0xD800;
const int _TAIL_SURROGATE_MIN = 0xDC00;

const int _REPLACEMENT_CHARACTER = 0xFFFD;
const int _BOM_CHARACTER = 0xFEFF;

// Whether [codeUnit] is a lead or tail surrogate (0xD800 .. 0xDFFF).
bool _isSurrogate(int codeUnit) =>
    (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;

// Whether [codeUnit] is a lead surrogate (0xD800 .. 0xDBFF).
bool _isLeadSurrogate(int codeUnit) =>
    (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

// Whether [codeUnit] is a tail surrogate (0xDC00 .. 0xDFFF).
bool _isTailSurrogate(int codeUnit) =>
    (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

// Returns the code point encoded by the surrogate pair [lead], [tail].
//
// The 0x10000 offset must be added, not or-ed: the shifted lead value can
// itself have bit 16 set (for every lead >= 0xD840), and an `|` would swallow
// the offset for those code points.
int _combineSurrogatePair(int lead, int tail) =>
    (0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10))
    | (tail & _SURROGATE_VALUE_MASK);
/**
 * Decodes UTF-8.
 *
 * The decoder handles chunked input: a multi-byte sequence may be split
 * across successive [convert] calls. The state of an unfinished sequence is
 * kept in the fields of this object between calls, and [close] reports a
 * sequence that was never finished.
 */
// TODO(floitsch): make this class public.
class _Utf8Decoder {
  // Whether malformed input is replaced by U+FFFD instead of throwing.
  final bool _allowMalformed;
  // True until the first character has been handled; a BOM is dropped only
  // while this is still true.
  bool _isFirstCharacter = true;
  // Bits of the code point collected so far for an unfinished sequence.
  int _value = 0;
  // Number of continuation bytes still missing from the unfinished sequence.
  int _expectedUnits = 0;
  // Total number of continuation bytes of the unfinished sequence (1 to 3).
  int _extraUnits = 0;

  _Utf8Decoder(this._allowMalformed);

  // Whether a multi-byte sequence was started but not yet completed.
  bool get hasPartialInput => _expectedUnits > 0;

  // Limits of one through four byte encodings.
  static const List<int> _LIMITS = const <int>[
      _ONE_BYTE_LIMIT,
      _TWO_BYTE_LIMIT,
      _THREE_BYTE_LIMIT,
      _FOUR_BYTE_LIMIT ];

  /**
   * Signals the end of the input.
   *
   * If a multi-byte sequence is still unfinished, throws a [FormatException],
   * or, when malformed input is allowed, writes one replacement character to
   * [sink] and discards the unfinished sequence.
   */
  void close(StringSink sink) {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw new FormatException("Unfinished UTF-8 octet sequence");
      }
      sink.writeCharCode(_REPLACEMENT_CHARACTER);
      // The unfinished sequence has been accounted for. Forget it, so that a
      // repeated close() or a later convert() does not report or continue it
      // a second time.
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /**
   * Decodes `codeUnits[startIndex .. endIndex - 1]` and writes the resulting
   * characters to [sink].
   *
   * Continues a sequence left unfinished by the previous call and, if the
   * chunk ends inside a sequence, stores that sequence for the next call.
   * A BOM (U+FEFF) is dropped if it is the very first character.
   *
   * Malformed input (unexpected byte, overlong encoding, value above
   * U+10FFFF) throws a [FormatException], or is replaced by U+FFFD when
   * malformed input is allowed. Values in the surrogate range are not checked.
   */
  void convert(List<int> codeUnits, int startIndex, int endIndex,
               StringSink sink) {
    // Work on local copies of the carried-over state and clear the fields;
    // they are written back at the end only if a sequence is still unfinished.
    int value = _value;
    int expectedUnits = _expectedUnits;
    int extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    int i = startIndex;
    loop: while (true) {
      // Phase 1: consume the continuation bytes of a started sequence.
      multibyte: if (expectedUnits > 0) {
        do {
          if (i == endIndex) {
            // Chunk ends inside the sequence; the state is saved below.
            break loop;
          }
          int unit = codeUnits[i];
          if ((unit & 0xC0) != 0x80) {
            // Not a continuation byte (10xxxxxx). The started sequence is
            // dropped; `i` is not advanced, so the offending byte is examined
            // again by the lead-byte loop below.
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw new FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
            }
            _isFirstCharacter = false;
            sink.writeCharCode(_REPLACEMENT_CHARACTER);
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        // A sequence with `extraUnits` continuation bytes must encode a value
        // above the limit of the next shorter encoding.
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overly long encoding. The value could be encoded with a shorter
          // encoding.
          if (!_allowMalformed) {
            throw new FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}");
          }
          expectedUnits = extraUnits = 0;
          value = _REPLACEMENT_CHARACTER;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw new FormatException("Character outside valid Unicode range: "
                                      "0x${value.toRadixString(16)}");
          }
          value = _REPLACEMENT_CHARACTER;
        }
        // Drop a BOM only if it is the first character of the input.
        if (!_isFirstCharacter || value != _BOM_CHARACTER) {
          sink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      // Phase 2: single-byte characters and lead bytes. A lead byte sets up
      // `value`/`expectedUnits` and jumps back to phase 1.
      while (i < endIndex) {
        int unit = codeUnits[i++];
        if (unit <= _ONE_BYTE_LIMIT) {
          _isFirstCharacter = false;
          sink.writeCharCode(unit);
        } else {
          if ((unit & 0xE0) == 0xC0) {
            // 110xxxxx: one continuation byte follows.
            value = unit & 0x1F;
            expectedUnits = extraUnits = 1;
            continue loop;
          }
          if ((unit & 0xF0) == 0xE0) {
            // 1110xxxx: two continuation bytes follow.
            value = unit & 0x0F;
            expectedUnits = extraUnits = 2;
            continue loop;
          }
          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
            // 11110xxx: three continuation bytes follow.
            value = unit & 0x07;
            expectedUnits = extraUnits = 3;
            continue loop;
          }
          // Stray continuation byte or a byte that is never valid.
          if (!_allowMalformed) {
            throw new FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
          }
          value = _REPLACEMENT_CHARACTER;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          sink.writeCharCode(value);
        }
      }
      break loop;
    }
    // Remember an unfinished sequence for the next chunk (or for close()).
    if (expectedUnits > 0) {
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}

View file

@ -0,0 +1,135 @@
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library utf8_test;
import "package:expect/expect.dart";
import 'dart:convert';
import 'dart:codec';
// Strict decoding through a default-constructed [Utf8Decoder].
String decode(List<int> bytes) {
  Utf8Decoder strictDecoder = new Utf8Decoder();
  return strictDecoder.convert(bytes);
}
// Lenient decoding through a [Utf8Decoder] created with `allowMalformed`.
String decodeAllowMalformed(List<int> bytes) =>
    new Utf8Decoder(allowMalformed: true).convert(bytes);
// Strict decoding through the shared [UTF8] codec instance.
String decode2(List<int> bytes) {
  return UTF8.decode(bytes);
}
// Lenient decoding through [UTF8], overriding `allowMalformed` per call.
String decodeAllowMalformed2(List<int> bytes) =>
    UTF8.decode(bytes, allowMalformed: true);
// Strict decoding through the `decode` method of a fresh [Utf8Codec].
String decode3(List<int> bytes) {
  Utf8Codec strictCodec = new Utf8Codec();
  return strictCodec.decode(bytes);
}
// Lenient decoding through the `decode` method of a [Utf8Codec] that was
// created with `allowMalformed`.
String decodeAllowMalformed3(List<int> bytes) =>
    new Utf8Codec(allowMalformed: true).decode(bytes);
// Strict decoding through the `decoder` of a fresh [Utf8Codec].
String decode4(List<int> bytes) {
  var strictDecoder = new Utf8Codec().decoder;
  return strictDecoder.convert(bytes);
}
// Lenient decoding through the `decoder` of a [Utf8Codec] that was created
// with `allowMalformed`.
String decodeAllowMalformed4(List<int> bytes) =>
    new Utf8Codec(allowMalformed: true).decoder.convert(bytes);
// Malformed byte sequences. main() expects a strict decoder to throw a
// FormatException for each entry, and a lenient decoder to turn each entry
// into exactly one replacement character (U+FFFD), also when the entry is
// combined with valid characters or with itself.
final TESTS = [
// Unfinished UTF-8 sequences.
[ 0xc3 ],
[ 0xE2, 0x82 ],
[ 0xF0, 0xA4, 0xAD ],
// Overlong encoding of euro-sign.
[ 0xF0, 0x82, 0x82, 0xAC ],
// Single bytes: lead bytes of overlong two-byte encodings (0xC0, 0xC1) and
// bytes that never appear in valid UTF-8 (0xF5 .. 0xFF).
[ 0xC0 ],
[ 0xC1 ],
[ 0xF5 ],
[ 0xF6 ],
[ 0xF7 ],
[ 0xF8 ],
[ 0xF9 ],
[ 0xFA ],
[ 0xFB ],
[ 0xFC ],
[ 0xFD ],
[ 0xFE ],
[ 0xFF ],
// Complete but overlong two-byte sequences.
[ 0xC0, 0x80 ],
[ 0xC1, 0x80 ],
// Outside valid range.
[ 0xF4, 0xBF, 0xBF, 0xBF ]];
// Pairs of malformed input bytes and the expected lenient decoding. In the
// expected string every "X" stands for one replacement character (U+FFFD);
// main() performs the substitution.
final TESTS2 = [
// Test that 0xC0|1, 0x80 does not eat the next character.
[[ 0xC0, 0x80, 0x61 ], "Xa" ],
[[ 0xC1, 0x80, 0x61 ], "Xa" ],
// 0xF5 .. 0xFF never appear in valid UTF-8 sequences.
// Each such byte and each following stray continuation byte (0x80) yields
// its own replacement character.
[[ 0xF5, 0x80 ], "XX" ],
[[ 0xF6, 0x80 ], "XX" ],
[[ 0xF7, 0x80 ], "XX" ],
[[ 0xF8, 0x80 ], "XX" ],
[[ 0xF9, 0x80 ], "XX" ],
[[ 0xFA, 0x80 ], "XX" ],
[[ 0xFB, 0x80 ], "XX" ],
[[ 0xFC, 0x80 ], "XX" ],
[[ 0xFD, 0x80 ], "XX" ],
[[ 0xFE, 0x80 ], "XX" ],
[[ 0xFF, 0x80 ], "XX" ],
[[ 0xF5, 0x80, 0x61 ], "XXa" ],
[[ 0xF6, 0x80, 0x61 ], "XXa" ],
[[ 0xF7, 0x80, 0x61 ], "XXa" ],
[[ 0xF8, 0x80, 0x61 ], "XXa" ],
[[ 0xF9, 0x80, 0x61 ], "XXa" ],
[[ 0xFA, 0x80, 0x61 ], "XXa" ],
[[ 0xFB, 0x80, 0x61 ], "XXa" ],
[[ 0xFC, 0x80, 0x61 ], "XXa" ],
[[ 0xFD, 0x80, 0x61 ], "XXa" ],
[[ 0xFE, 0x80, 0x61 ], "XXa" ],
[[ 0xFF, 0x80, 0x61 ], "XXa" ],
// Characters outside the valid range.
// The same invalid bytes, each followed by two stray continuation bytes.
[[ 0xF5, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xF6, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xF7, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xF8, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xF9, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFA, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFB, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFC, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFD, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFE, 0x80, 0x80, 0x61 ], "XXXa" ],
[[ 0xFF, 0x80, 0x80, 0x61 ], "XXXa" ]];
main() {
  // Returns a fresh list holding the elements of all [parts], in order.
  List concat(List parts) {
    var joined = new List.from(parts[0]);
    for (int i = 1; i < parts.length; i++) {
      joined.addAll(parts[i]);
    }
    return joined;
  }

  var letterA = [0x61];
  var aRing = [0xc3, 0xa5];  // UTF-8 encoding of "å".

  // Every malformed sequence is tried alone, next to an ASCII letter, next to
  // a two-byte character and next to itself. Each occurrence of the malformed
  // sequence must become one replacement character (U+FFFD) when malformed
  // input is allowed.
  var derivedCases = TESTS.expand((malformed) {
    return [
      [ malformed, "\u{FFFD}" ],
      [ concat([letterA, malformed]), "a\u{FFFD}" ],
      [ concat([letterA, malformed, letterA]), "a\u{FFFD}a" ],
      [ concat([malformed, letterA]), "\u{FFFD}a" ],
      [ concat([malformed, malformed]), "\u{FFFD}\u{FFFD}" ],
      [ concat([malformed, letterA, malformed]), "\u{FFFD}a\u{FFFD}" ],
      [ concat([aRing, malformed]), "å\u{FFFD}" ],
      [ concat([aRing, malformed, aRing]), "å\u{FFFD}å" ],
      [ concat([malformed, aRing]), "\u{FFFD}å" ],
      [ concat([malformed, aRing, malformed]), "\u{FFFD}å\u{FFFD}" ]
    ];
  });

  // The second table spells out its expectation with "X" as a stand-in for
  // the replacement character.
  var literalCases = TESTS2.map((pair) {
    var input = pair[0];
    String output = pair[1].replaceAll("X", "\u{FFFD}");
    return [input, output];
  });

  var everyCase = [];
  everyCase.addAll(derivedCases);
  everyCase.addAll(literalCases);

  var strictDecoders = [decode, decode2, decode3, decode4];
  var lenientDecoders = [decodeAllowMalformed,
                         decodeAllowMalformed2,
                         decodeAllowMalformed3,
                         decodeAllowMalformed4];

  for (var testCase in everyCase) {
    List<int> bytes = testCase[0];
    // Every strict entry point must reject the input ...
    for (var strict in strictDecoders) {
      Expect.throws(() => strict(bytes), (e) => e is FormatException);
    }
    // ... and every lenient one must produce the expected replacement.
    String expected = testCase[1];
    for (var lenient in lenientDecoders) {
      Expect.equals(expected, lenient(bytes));
    }
  }
}

View file

@ -0,0 +1,46 @@
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library utf8_test;
import "package:expect/expect.dart";
import 'dart:codec';
import 'dart:convert';
main() {
  // Checks that every decoding entry point, the strict ones first and then
  // the ones that allow malformed input, decodes [bytes] to [expected].
  void check(String expected, List<int> bytes) {
    Expect.equals(expected, UTF8.decode(bytes));
    Expect.equals(expected, UTF8.decoder.convert(bytes));
    Expect.equals(expected, new Utf8Decoder().convert(bytes));

    Expect.equals(expected, UTF8.decode(bytes, allowMalformed: true));
    Expect.equals(expected,
                  new Utf8Codec(allowMalformed: true).decode(bytes));
    Expect.equals(expected,
                  new Utf8Codec(allowMalformed: true).decoder.convert(bytes));
    Expect.equals(expected,
                  new Utf8Decoder(allowMalformed: true).convert(bytes));
  }

  // Test that UTF8-decoder removes leading BOM.
  check("a", [0xEF, 0xBB, 0xBF, 0x61]);
  // Input consisting of nothing but a BOM decodes to the empty string.
  check("", [0xEF, 0xBB, 0xBF]);
  // A BOM that is not at the start of the input is kept.
  check("a\u{FEFF}", [0x61, 0xEF, 0xBB, 0xBF]);
}

View file

@ -0,0 +1,47 @@
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library utf8_test;
import "package:expect/expect.dart";
import 'dart:convert';
// Strict decoding through a default-constructed [Utf8Decoder].
String decode(List<int> bytes) {
  Utf8Decoder strictDecoder = new Utf8Decoder();
  return strictDecoder.convert(bytes);
}
// Decodes well-formed UTF-8 with one- to four-byte sequences and compares the
// result against the expected string literal.
main() {
  // Google favorite: "Îñţérñåţîöñåļîžåţîờñ".
  String string = decode([0xc3, 0x8e, 0xc3, 0xb1, 0xc5, 0xa3, 0xc3, 0xa9, 0x72,
                          0xc3, 0xb1, 0xc3, 0xa5, 0xc5, 0xa3, 0xc3, 0xae, 0xc3,
                          0xb6, 0xc3, 0xb1, 0xc3, 0xa5, 0xc4, 0xbc, 0xc3, 0xae,
                          0xc5, 0xbe, 0xc3, 0xa5, 0xc5, 0xa3, 0xc3, 0xae, 0xe1,
                          0xbb, 0x9d, 0xc3, 0xb1]);
  Expect.stringEquals("Îñţérñåţîöñåļîžåţîờñ", string);

  // Blueberry porridge in Danish: "blåbærgrød".
  string = decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, 0x72, 0x67, 0x72,
                   0xc3, 0xb8, 0x64]);
  Expect.stringEquals("blåbærgrød", string);

  // "சிவா அணாமாைல", that is "Siva Annamalai" in Tamil.
  string = decode([0xe0, 0xae, 0x9a, 0xe0, 0xae, 0xbf, 0xe0, 0xae, 0xb5, 0xe0,
                   0xae, 0xbe, 0x20, 0xe0, 0xae, 0x85, 0xe0, 0xae, 0xa3, 0xe0,
                   0xae, 0xbe, 0xe0, 0xae, 0xae, 0xe0, 0xae, 0xbe, 0xe0, 0xaf,
                   0x88, 0xe0, 0xae, 0xb2]);
  Expect.stringEquals("சிவா அணாமாைல", string);

  // "िसवा अणामालै", that is "Siva Annamalai" in Devanagari.
  string = decode([0xe0, 0xa4, 0xbf, 0xe0, 0xa4, 0xb8, 0xe0, 0xa4, 0xb5, 0xe0,
                   0xa4, 0xbe, 0x20, 0xe0, 0xa4, 0x85, 0xe0, 0xa4, 0xa3, 0xe0,
                   0xa4, 0xbe, 0xe0, 0xa4, 0xae, 0xe0, 0xa4, 0xbe, 0xe0, 0xa4,
                   0xb2, 0xe0, 0xa5, 0x88]);
  Expect.stringEquals("िसवा अणामालै", string);

  // DESERET CAPITAL LETTER BEE, unicode 0x10412(0xD801+0xDC12)
  // UTF-8: F0 90 90 92
  string = decode([0xf0, 0x90, 0x90, 0x92]);
  // The character lies outside the BMP, so the string holds a surrogate pair.
  // Expected value first, so that a failure reports the values correctly.
  Expect.equals(2, string.length);
  Expect.equals(2, "𐐒".length);
  Expect.stringEquals("𐐒", string);

  // TODO(ahe): Add tests of bad input.
}