Replace old utf8 decoder with new one.

R=lrn@google.com Review URL: https://codereview.chromium.org//19187002 git-svn-id: https://dart.googlecode.com/svn/branches/bleeding_edge/dart@25089 260f80e4-7a28-3924-810f-c04153c831b5
2024-09-21 08:11:28 +00:00 · 2013-07-17 11:35:12 +00:00 · 2013-07-17 11:35:12 +00:00 · d0768e543f
parent 912b741073
commit d0768e543f
5 changed files with 441 additions and 7 deletions
--- a/sdk/lib/codec/encoding.dart
+++ b/sdk/lib/codec/encoding.dart
@ -20,11 +20,41 @@ const UTF8 = const Utf8Codec();
 * A [Utf8Codec] encodes strings to utf-8 code units (bytes) and decodes
 * UTF-8 code units to strings.
 */
-// TODO(floitsch): Needs a way to specify if decoding should throw or use
-// the replacement character.
 class Utf8Codec extends Encoding {
-  const Utf8Codec();
+  final bool _allowMalformed;
+
+  /**
+   * Instantiates a new [Utf8Codec].
+   *
+   * The optional [allowMalformed] argument defines how [decoder] (and [decode])
+   * deal with invalid or unterminated character sequences.
+   *
+   * If it is `true` (and not overridden at the method invocation) [decode] and
+   * the [decoder] replace invalid (or unterminated) octet
+   * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
+   * they throw a [FormatException].
+   */
+  const Utf8Codec({ bool allowMalformed: false })
+      : _allowMalformed = allowMalformed;
+
+  /**
+   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
+   * corresponding string.
+   *
+   * If [allowMalformed] is `true` the decoder replaces invalid (or
+   * unterminated) character sequences with the Unicode Replacement character
+   * `U+FFFD` (�). Otherwise it throws a [FormatException].
+   *
+   * If [allowMalformed] is not given, it defaults to the `allowMalformed` that
+   * was used to instantiate `this`.
+   */
+  String decode(List<int> codeUnits, { bool allowMalformed }) {
+    if (allowMalformed == null) allowMalformed = _allowMalformed;
+    return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
+  }

  Converter<String, List<int>> get encoder => new Utf8Encoder();
-  Converter<List<int>, String> get decoder => new Utf8Decoder();
+  Converter<List<int>, String> get decoder {
+    return new Utf8Decoder(allowMalformed: _allowMalformed);
+  }
 }
--- a/sdk/lib/convert/utf.dart
+++ b/sdk/lib/convert/utf.dart
@ -21,11 +21,187 @@ class Utf8Encoder extends Converter<String, List<int>> {
 * to a string.
 */
 class Utf8Decoder extends Converter<List<int>, String> {
+  final bool _allowMalformed;
+
+  /**
+   * Instantiates a new [Utf8Decoder].
+   *
+   * The optional [allowMalformed] argument defines how [convert] deals
+   * with invalid or unterminated character sequences.
+   *
+   * If it is `true` [convert] replaces invalid (or unterminated) character
+   * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
+   * it throws a [FormatException].
+   */
+  Utf8Decoder({ bool allowMalformed: false })
+      : this._allowMalformed = allowMalformed;
+
  /**
   * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
   * corresponding string.
   */
-  // TODO(floitsch): allow to configure the decoder (for example the replacement
-  // character).
-  String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits);
+  String convert(List<int> codeUnits) {
+    StringBuffer buffer = new StringBuffer();
+    _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
+    decoder.convert(codeUnits, 0, codeUnits.length, buffer);
+    decoder.close(buffer);
+    return buffer.toString();
+  }
+}
+
+// UTF-8 constants: largest code point encodable in one to four bytes.
+const int _ONE_BYTE_LIMIT = 0x7f;   // 7 bits
+const int _TWO_BYTE_LIMIT = 0x7ff;  // 11 bits
+const int _THREE_BYTE_LIMIT = 0xffff;  // 16 bits
+const int _FOUR_BYTE_LIMIT = 0x10ffff;  // 21 bits, truncated to Unicode max.
+
+// UTF-16 constants.
+const int _SURROGATE_MASK = 0xF800;  // Top five bits: lead or tail surrogate.
+const int _SURROGATE_TAG_MASK = 0xFC00;  // Top six bits: lead vs. tail tag.
+const int _SURROGATE_VALUE_MASK = 0x3FF;  // Low ten payload bits.
+const int _LEAD_SURROGATE_MIN = 0xD800;
+const int _TAIL_SURROGATE_MIN = 0xDC00;
+
+const int _REPLACEMENT_CHARACTER = 0xFFFD;
+const int _BOM_CHARACTER = 0xFEFF;
+
+bool _isSurrogate(int codeUnit) =>  // True for 0xD800..0xDFFF.
+    (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
+bool _isLeadSurrogate(int codeUnit) =>  // True for 0xD800..0xDBFF.
+    (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
+bool _isTailSurrogate(int codeUnit) =>  // True for 0xDC00..0xDFFF.
+    (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
+int _combineSurrogatePair(int lead, int tail) =>  // Payload bits OR'ed onto 0x10000.
+    0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)
+            | (tail & _SURROGATE_VALUE_MASK);
+
+
+/**
+ * Decodes UTF-8.
+ *
+ * The decoder handles chunked input.
+ */
+// TODO(floitsch): make this class public.
+class _Utf8Decoder {
+  final bool _allowMalformed;  // false: throw FormatException; true: emit U+FFFD.
+  bool _isFirstCharacter = true;  // Cleared after the first character; gates BOM removal.
+  int _value = 0;  // Bits of an unfinished sequence carried to the next chunk.
+  int _expectedUnits = 0;  // Continuation bytes still missing from that sequence.
+  int _extraUnits = 0;  // Total continuation bytes of that sequence (1 to 3).
+
+  _Utf8Decoder(this._allowMalformed);
+
+  bool get hasPartialInput => _expectedUnits > 0;  // A sequence is still unfinished.
+
+  // Limits of one through four byte encodings, indexed by byte count minus one.
+  static const List<int> _LIMITS = const <int>[
+      _ONE_BYTE_LIMIT,
+      _TWO_BYTE_LIMIT,
+      _THREE_BYTE_LIMIT,
+      _FOUR_BYTE_LIMIT ];
+
+  void close(StringSink sink) {  // Reports or replaces a sequence cut off at end of input.
+    if (hasPartialInput) {
+      if (!_allowMalformed) {
+        throw new FormatException("Unfinished UTF-8 octet sequence");
+      }
+      sink.writeCharCode(_REPLACEMENT_CHARACTER);
+    }
+  }
+
+  void convert(List<int> codeUnits, int startIndex, int endIndex,  // Decodes [startIndex, endIndex).
+               StringSink sink) {  // Decoded characters are written to [sink].
+    int value = _value;  // Resume from whatever the previous chunk left pending.
+    int expectedUnits = _expectedUnits;
+    int extraUnits = _extraUnits;
+    _value = 0;  // Clear saved state; it is stored again below if still pending.
+    _expectedUnits = 0;
+    _extraUnits = 0;
+
+    int i = startIndex;
+    loop: while (true) {
+      multibyte: if (expectedUnits > 0) {  // Finish the pending multi-byte sequence.
+        do {
+          if (i == endIndex) {
+            break loop;  // Chunk ended mid-sequence; state is saved after the loop.
+          }
+          int unit = codeUnits[i];
+          if ((unit & 0xC0) != 0x80) {  // Not a 10xxxxxx continuation byte.
+            expectedUnits = 0;
+            if (!_allowMalformed) {
+              throw new FormatException(
+                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
+            }
+            _isFirstCharacter = false;
+            sink.writeCharCode(_REPLACEMENT_CHARACTER);
+            break multibyte;  // `i` is not advanced: the byte is re-read by the scan below.
+          } else {
+            value = (value << 6) | (unit & 0x3f);  // Append the six payload bits.
+            expectedUnits--;
+            i++;
+          }
+        } while (expectedUnits > 0);
+        if (value <= _LIMITS[extraUnits - 1]) {
+          // Overly long encoding. The value could be encoded with a shorter
+          // encoding.
+          if (!_allowMalformed) {
+            throw new FormatException(
+                "Overlong encoding of 0x${value.toRadixString(16)}");
+          }
+          expectedUnits = extraUnits = 0;
+          value = _REPLACEMENT_CHARACTER;
+        }
+        if (value > _FOUR_BYTE_LIMIT) {  // Beyond U+10FFFF.
+          if (!_allowMalformed) {
+            throw new FormatException("Character outside valid Unicode range: "
+                                      "0x${value.toRadixString(16)}");
+          }
+          value = _REPLACEMENT_CHARACTER;
+        }
+        if (!_isFirstCharacter || value != _BOM_CHARACTER) {  // Drop a BOM only as first character.
+          sink.writeCharCode(value);
+        }
+        _isFirstCharacter = false;
+      }
+
+      while (i < endIndex) {  // Scan single-byte characters and lead bytes.
+        int unit = codeUnits[i++];
+        if (unit <= _ONE_BYTE_LIMIT) {
+          _isFirstCharacter = false;
+          sink.writeCharCode(unit);
+        } else {
+          if ((unit & 0xE0) == 0xC0) {  // 110xxxxx: one continuation byte follows.
+            value = unit & 0x1F;
+            expectedUnits = extraUnits = 1;
+            continue loop;
+          }
+          if ((unit & 0xF0) == 0xE0) {  // 1110xxxx: two continuation bytes follow.
+            value = unit & 0x0F;
+            expectedUnits = extraUnits = 2;
+            continue loop;
+          }
+          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
+          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {  // 11110xxx: three continuation bytes follow.
+            value = unit & 0x07;
+            expectedUnits = extraUnits = 3;
+            continue loop;
+          }
+          if (!_allowMalformed) {  // Stray continuation byte or invalid lead byte.
+            throw new FormatException(
+                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
+          }
+          value = _REPLACEMENT_CHARACTER;
+          expectedUnits = extraUnits = 0;
+          _isFirstCharacter = false;
+          sink.writeCharCode(value);
+        }
+      }
+      break loop;
+    }
+    if (expectedUnits > 0) {  // Save the unfinished sequence for the next call or close().
+      _value = value;
+      _expectedUnits = expectedUnits;
+      _extraUnits = extraUnits;
+    }
+  }
 }
--- a/tests/lib/convert/utf82_test.dart
+++ b/tests/lib/convert/utf82_test.dart
@ -0,0 +1,135 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+library utf8_test;
+import "package:expect/expect.dart";
+import 'dart:convert';
+import 'dart:codec';
+
+String decode(List<int> bytes) => new Utf8Decoder().convert(bytes);
+String decodeAllowMalformed(List<int> bytes) {
+  return new Utf8Decoder(allowMalformed: true).convert(bytes);
+}
+
+String decode2(List<int> bytes) => UTF8.decode(bytes);
+String decodeAllowMalformed2(List<int> bytes) {
+  return UTF8.decode(bytes, allowMalformed: true);
+}
+
+String decode3(List<int> bytes) => new Utf8Codec().decode(bytes);
+String decodeAllowMalformed3(List<int> bytes) {
+  return new Utf8Codec(allowMalformed: true).decode(bytes);
+}
+
+String decode4(List<int> bytes) => new Utf8Codec().decoder.convert(bytes);
+String decodeAllowMalformed4(List<int> bytes) {
+  return new Utf8Codec(allowMalformed: true).decoder.convert(bytes);
+}
+
+final TESTS = [
+  // Unfinished UTF-8 sequences.
+  [ 0xc3 ],
+  [ 0xE2, 0x82 ],
+  [ 0xF0, 0xA4, 0xAD ],
+  // Overlong encoding of euro-sign.
+  [ 0xF0, 0x82, 0x82, 0xAC ],
+  // Other overlong/unfinished sequences.
+  [ 0xC0 ],
+  [ 0xC1 ],
+  [ 0xF5 ],
+  [ 0xF6 ],
+  [ 0xF7 ],
+  [ 0xF8 ],
+  [ 0xF9 ],
+  [ 0xFA ],
+  [ 0xFB ],
+  [ 0xFC ],
+  [ 0xFD ],
+  [ 0xFE ],
+  [ 0xFF ],
+  [ 0xC0, 0x80 ],
+  [ 0xC1, 0x80 ],
+  // Outside valid range.
+  [ 0xF4, 0xBF, 0xBF, 0xBF ]];
+
+final TESTS2 = [
+  // Test that 0xC0|1, 0x80 does not eat the next character.
+  [[ 0xC0, 0x80, 0x61 ], "Xa" ],
+  [[ 0xC1, 0x80, 0x61 ], "Xa" ],
+  // 0xF5 .. 0xFF never appear in valid UTF-8 sequences.
+  [[ 0xF5, 0x80 ], "XX" ],
+  [[ 0xF6, 0x80 ], "XX" ],
+  [[ 0xF7, 0x80 ], "XX" ],
+  [[ 0xF8, 0x80 ], "XX" ],
+  [[ 0xF9, 0x80 ], "XX" ],
+  [[ 0xFA, 0x80 ], "XX" ],
+  [[ 0xFB, 0x80 ], "XX" ],
+  [[ 0xFC, 0x80 ], "XX" ],
+  [[ 0xFD, 0x80 ], "XX" ],
+  [[ 0xFE, 0x80 ], "XX" ],
+  [[ 0xFF, 0x80 ], "XX" ],
+  [[ 0xF5, 0x80, 0x61 ], "XXa" ],
+  [[ 0xF6, 0x80, 0x61 ], "XXa" ],
+  [[ 0xF7, 0x80, 0x61 ], "XXa" ],
+  [[ 0xF8, 0x80, 0x61 ], "XXa" ],
+  [[ 0xF9, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFA, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFB, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFC, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFD, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFE, 0x80, 0x61 ], "XXa" ],
+  [[ 0xFF, 0x80, 0x61 ], "XXa" ],
+  // Characters outside the valid range.
+  [[ 0xF5, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xF6, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xF7, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xF8, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xF9, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFA, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFB, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFC, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFD, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFE, 0x80, 0x80, 0x61 ], "XXXa" ],
+  [[ 0xFF, 0x80, 0x80, 0x61 ], "XXXa" ]];
+
+main() {
+  var allTests = TESTS.expand((test) {
+    // Pairs of test and expected string output when malformed strings are
+    // allowed. Replacement character: U+FFFD
+    return [[ test, "\u{FFFD}" ],
+            [ new List.from([0x61])..addAll(test), "a\u{FFFD}" ],
+            [ new List.from([0x61])..addAll(test)..add(0x61), "a\u{FFFD}a" ],
+            [ new List.from(test)..add(0x61), "\u{FFFD}a" ],
+            [ new List.from(test)..addAll(test), "\u{FFFD}\u{FFFD}" ],
+            [ new List.from(test)..add(0x61)..addAll(test),
+              "\u{FFFD}a\u{FFFD}" ],
+            [ new List.from([0xc3, 0xa5])..addAll(test), "å\u{FFFD}" ],
+            [ new List.from([0xc3, 0xa5])..addAll(test)..addAll([0xc3, 0xa5]),
+              "å\u{FFFD}å" ],
+            [ new List.from(test)..addAll([0xc3, 0xa5]), "\u{FFFD}å" ],
+            [ new List.from(test)..addAll([0xc3, 0xa5])..addAll(test),
+              "\u{FFFD}å\u{FFFD}" ]];
+  });
+
+  var allTests2 = TESTS2.map((test) {
+    // Pairs of test and expected string output when malformed strings are
+    // allowed. Replacement character: U+FFFD
+    String expected = test[1].replaceAll("X", "\u{FFFD}");
+    return [test[0], expected];
+  });
+
+  for (var test in []..addAll(allTests)..addAll(allTests2)) {
+    List<int> bytes = test[0];
+    Expect.throws(() => decode(bytes), (e) => e is FormatException);
+    Expect.throws(() => decode2(bytes), (e) => e is FormatException);
+    Expect.throws(() => decode3(bytes), (e) => e is FormatException);
+    Expect.throws(() => decode4(bytes), (e) => e is FormatException);
+
+    String expected = test[1];
+    Expect.equals(expected, decodeAllowMalformed(bytes));
+    Expect.equals(expected, decodeAllowMalformed2(bytes));
+    Expect.equals(expected, decodeAllowMalformed3(bytes));
+    Expect.equals(expected, decodeAllowMalformed4(bytes));
+  }
+}
--- a/tests/lib/convert/utf83_test.dart
+++ b/tests/lib/convert/utf83_test.dart
@ -0,0 +1,46 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+library utf8_test;
+import "package:expect/expect.dart";
+import 'dart:codec';
+import 'dart:convert';
+
+main() {
+  // Test that UTF8-decoder removes leading BOM.
+  Expect.equals("a", UTF8.decode([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("a", UTF8.decoder.convert([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("a", new Utf8Decoder().convert([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("a",
+                UTF8.decode([0xEF, 0xBB, 0xBF, 0x61], allowMalformed: true));
+  Expect.equals("a", new Utf8Codec(allowMalformed: true)
+                         .decode([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("a", new Utf8Codec(allowMalformed: true)
+                         .decoder.convert([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("a", new Utf8Decoder(allowMalformed: true)
+                         .convert([0xEF, 0xBB, 0xBF, 0x61]));
+  Expect.equals("", UTF8.decode([0xEF, 0xBB, 0xBF]));
+  Expect.equals("", UTF8.decoder.convert([0xEF, 0xBB, 0xBF]));
+  Expect.equals("", new Utf8Decoder().convert([0xEF, 0xBB, 0xBF]));
+  Expect.equals("",
+                UTF8.decode([0xEF, 0xBB, 0xBF], allowMalformed: true));
+  Expect.equals("", new Utf8Codec(allowMalformed: true)
+                         .decode([0xEF, 0xBB, 0xBF]));
+  Expect.equals("", new Utf8Codec(allowMalformed: true)
+                         .decoder.convert([0xEF, 0xBB, 0xBF]));
+  Expect.equals("", new Utf8Decoder(allowMalformed: true)
+                         .convert([0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}", UTF8.decode([0x61, 0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}", UTF8.decoder.convert([0x61, 0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}",
+                new Utf8Decoder().convert([0x61, 0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}",
+                UTF8.decode([0x61, 0xEF, 0xBB, 0xBF], allowMalformed: true));
+  Expect.equals("a\u{FEFF}", new Utf8Codec(allowMalformed: true)
+                         .decode([0x61, 0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}", new Utf8Codec(allowMalformed: true)
+                         .decoder.convert([0x61, 0xEF, 0xBB, 0xBF]));
+  Expect.equals("a\u{FEFF}", new Utf8Decoder(allowMalformed: true)
+                         .convert([0x61, 0xEF, 0xBB, 0xBF]));
+}
--- a/tests/lib/convert/utf8_test.dart
+++ b/tests/lib/convert/utf8_test.dart
@ -0,0 +1,47 @@
+// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+library utf8_test;
+import "package:expect/expect.dart";
+import 'dart:convert';
+
+String decode(List<int> bytes) => new Utf8Decoder().convert(bytes);
+
+main() {
+  // Google favorite: "Îñţérñåţîöñåļîžåţîờñ".
+  String string = decode([0xc3, 0x8e, 0xc3, 0xb1, 0xc5, 0xa3, 0xc3, 0xa9, 0x72,
+                          0xc3, 0xb1, 0xc3, 0xa5, 0xc5, 0xa3, 0xc3, 0xae, 0xc3,
+                          0xb6, 0xc3, 0xb1, 0xc3, 0xa5, 0xc4, 0xbc, 0xc3, 0xae,
+                          0xc5, 0xbe, 0xc3, 0xa5, 0xc5, 0xa3, 0xc3, 0xae, 0xe1,
+                          0xbb, 0x9d, 0xc3, 0xb1]);
+  Expect.stringEquals("Îñţérñåţîöñåļîžåţîờñ", string);
+
+  // Blueberry porridge in Danish: "blåbærgrød".
+  string = decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, 0x72, 0x67, 0x72,
+                   0xc3, 0xb8, 0x64]);
+  Expect.stringEquals("blåbærgrød", string);
+
+  // "சிவா அணாமாைல", that is "Siva Annamalai" in Tamil.
+  string = decode([0xe0, 0xae, 0x9a, 0xe0, 0xae, 0xbf, 0xe0, 0xae, 0xb5, 0xe0,
+                   0xae, 0xbe, 0x20, 0xe0, 0xae, 0x85, 0xe0, 0xae, 0xa3, 0xe0,
+                   0xae, 0xbe, 0xe0, 0xae, 0xae, 0xe0, 0xae, 0xbe, 0xe0, 0xaf,
+                   0x88, 0xe0, 0xae, 0xb2]);
+  Expect.stringEquals("சிவா அணாமாைல", string);
+
+  // "िसवा अणामालै", that is "Siva Annamalai" in Devanagari.
+  string = decode([0xe0, 0xa4, 0xbf, 0xe0, 0xa4, 0xb8, 0xe0, 0xa4, 0xb5, 0xe0,
+                   0xa4, 0xbe, 0x20, 0xe0, 0xa4, 0x85, 0xe0, 0xa4, 0xa3, 0xe0,
+                   0xa4, 0xbe, 0xe0, 0xa4, 0xae, 0xe0, 0xa4, 0xbe, 0xe0, 0xa4,
+                   0xb2, 0xe0, 0xa5, 0x88]);
+  Expect.stringEquals("िसवा अणामालै", string);
+
+  // DESERET CAPITAL LETTER BEE, unicode 0x10412(0xD801+0xDC12)
+  // UTF-8: F0 90 90 92
+  string = decode([0xf0, 0x90, 0x90, 0x92]);
+  Expect.equals(string.length, 2);
+  Expect.equals("𐐒".length, 2);
+  Expect.stringEquals("𐐒", string);
+
+  // TODO(ahe): Add tests of bad input.
+}