Bi-directional iterator for runes of a string.

Review URL: https://codereview.chromium.org//12094056

git-svn-id: https://dart.googlecode.com/svn/branches/bleeding_edge/dart@18382 260f80e4-7a28-3924-810f-c04153c831b5
This commit is contained in:
lrn@google.com 2013-02-12 15:22:31 +00:00
parent 10d73f779b
commit 3520e88c04
6 changed files with 295 additions and 8 deletions

View file

@ -437,9 +437,7 @@ class _StringBase {
throw new UnimplementedError("String.codeUnits");
}
Iterable<int> get runes {
throw new UnimplementedError("String.runes");
}
Runes get runes => new Runes(this);
String toUpperCase() native "String_toUpperCase";

View file

@ -683,3 +683,8 @@ class EmptyIterator<E> implements Iterator<E> {
bool moveNext() => false;
E get current => null;
}
/**
 * An [Iterator] that can move in both directions.
 *
 * Implements the [Iterator] interface and adds [movePrevious] as the
 * backwards counterpart of the iterator's forward movement.
 */
abstract class BiDirectionalIterator<T> implements Iterator<T> {
/**
 * Moves the iterator one element backwards.
 *
 * NOTE(review): the contract of the returned bool is not spelled out here;
 * presumably true means a previous element exists and is now current,
 * mirroring [moveNext] - confirm against the implementations.
 */
bool movePrevious();
}

View file

@ -153,9 +153,7 @@ class JSString implements String {
throw new UnimplementedError("String.codeUnits");
}
Iterable<int> get runes {
throw new UnimplementedError("String.runes");
}
Runes get runes => new Runes(this);
int indexOf(String other, [int start = 0]) {
checkNull(other);

View file

@ -432,3 +432,16 @@ class _GeneratorIterator<E> implements Iterator<E> {
E get current => _current;
}
/**
 * An [Iterator] that allows moving backwards as well as forwards.
 *
 * Extends [Iterator] with a single extra operation, [movePrevious].
 */
abstract class BiDirectionalIterator<T> extends Iterator<T> {
/**
 * Move back to the previous element.
 *
 * Returns true and updates [current] if successful. Returns false
 * and sets [current] to null if there is no previous element.
 */
bool movePrevious();
}

View file

@ -280,8 +280,7 @@ abstract class String implements Comparable, Pattern {
* as one integer by this iterator. Unmatched surrogate halves are treated
* like valid 16-bit code-units.
*/
// TODO(floitsch): make it a Runes class.
Iterable<int> get runes;
Runes get runes;
/**
* If this string is not already all lower case, returns a new string
@ -297,3 +296,199 @@ abstract class String implements Comparable, Pattern {
// TODO(floitsch): document better. (See EcmaScript for description).
String toUpperCase();
}
/**
 * An [Iterable] over the runes (Unicode code points) of a [String].
 *
 * Elements are produced by a [RuneIterator] over [string].
 */
class Runes extends Iterable<int> {
  /** The string whose runes this iterable exposes. */
  final String string;

  /** Creates the rune view of [string]. */
  Runes(this.string);

  /** A new [RuneIterator] positioned at the beginning of [string]. */
  RuneIterator get iterator => new RuneIterator(string);

  /**
   * The last rune of [string].
   *
   * Computed directly from the final one or two code units instead of
   * iterating over the whole string.
   *
   * Throws a [StateError] if [string] is empty.
   */
  int get last {
    int unitCount = string.length;
    if (unitCount == 0) {
      throw new StateError("No elements.");
    }
    int finalUnit = string.charCodeAt(unitCount - 1);
    // Only a trail surrogate with something in front of it can be the second
    // half of a surrogate pair.
    bool mayClosePair = unitCount > 1 && _isTrailSurrogate(finalUnit);
    if (!mayClosePair) {
      return finalUnit;
    }
    int precedingUnit = string.charCodeAt(unitCount - 2);
    if (!_isLeadSurrogate(precedingUnit)) {
      // Unmatched trail surrogate: returned as a plain code unit.
      return finalUnit;
    }
    return _combineSurrogatePair(precedingUnit, finalUnit);
  }
}
// Whether [code] (a 16-bit unsigned integer) is a UTF-16 lead surrogate,
// i.e. its top six bits match those of 0xD800.
bool _isLeadSurrogate(int code) {
  const int surrogateTagMask = 0xFC00;
  const int leadSurrogateTag = 0xD800;
  return (code & surrogateTagMask) == leadSurrogateTag;
}
// Whether [code] (a 16-bit unsigned integer) is a UTF-16 trail surrogate,
// i.e. its top six bits match those of 0xDC00.
bool _isTrailSurrogate(int code) {
  const int surrogateTagMask = 0xFC00;
  const int trailSurrogateTag = 0xDC00;
  return (code & surrogateTagMask) == trailSurrogateTag;
}
// Combine a lead ([start]) and a trail ([end]) surrogate into the single
// code point they encode: the low ten bits of each half are concatenated
// and offset by 0x10000.
int _combineSurrogatePair(int start, int end) {
  int highBits = (start & 0x3FF) << 10;
  int lowBits = end & 0x3FF;
  return 0x10000 + highBits + lowBits;
}
/**
 * [Iterator] for reading Unicode code points out of a Dart string.
 *
 * The iterator can be moved both forwards ([moveNext]) and backwards
 * ([movePrevious]). A lead surrogate directly followed by a trail surrogate
 * is combined into a single code point; any other code unit, including an
 * unmatched surrogate, is returned unchanged.
 */
class RuneIterator implements BiDirectionalIterator<int> {
  /** String being iterated. */
  final String string;
  /** Position before the current code point. */
  int _position;
  /** Position after the current code point. */
  int _nextPosition;
  /**
   * Current code point.
   *
   * If the iterator has hit either end, the [_currentCodePoint] is null
   * and [: _position == _nextPosition :].
   */
  int _currentCodePoint;

  /** Create an iterator positioned at the beginning of the string. */
  RuneIterator(String string)
      : this.string = string, _position = 0, _nextPosition = 0;

  /**
   * Create an iterator positioned before the [index]th code unit of the string.
   *
   * When created, there is no [current] value.
   * A [moveNext] will use the rune starting at [index] as the current value,
   * and a [movePrevious] will use the rune ending just before [index] as the
   * current value.
   *
   * Throws a [RangeError] if [index] is negative or greater than
   * [:string.length:], and an [ArgumentError] if [index] is in the middle
   * of a surrogate pair.
   */
  RuneIterator.at(String string, int index)
      : string = string, _position = index, _nextPosition = index {
    if (index < 0 || index > string.length) {
      throw new RangeError.range(index, 0, string.length);
    }
    _checkSplitSurrogate(index);
  }

  /** Throw an error if the index is in the middle of a surrogate pair. */
  void _checkSplitSurrogate(int index) {
    if (index > 0 && index < string.length &&
        _isLeadSurrogate(string.charCodeAt(index - 1)) &&
        _isTrailSurrogate(string.charCodeAt(index))) {
      throw new ArgumentError("Index inside surrogate pair: $index");
    }
  }

  /**
   * Returns the starting position of the current rune in the string.
   *
   * Returns null if the [current] rune is null.
   */
  int get rawIndex => (_position != _nextPosition) ? _position : null;

  /**
   * Resets the iterator to the rune at the specified index of the string.
   *
   * After a successful call, [current] is the rune starting at [rawIndex].
   *
   * Setting a negative [rawIndex], or one greater than or equal to
   * [:string.length:], throws a [RangeError]. Setting it in the middle of a
   * surrogate pair throws an [ArgumentError].
   *
   * To place the iterator at the end of the string, where there is no
   * current rune, use [reset] with [:string.length:] instead.
   */
  void set rawIndex(int rawIndex) {
    // Both bounds are checked here, rather than leaving the lower bound to
    // [reset], so that every out-of-range value is reported against the
    // interval that is actually valid for this setter.
    if (rawIndex < 0 || rawIndex >= string.length) {
      throw new RangeError.range(rawIndex, 0, string.length - 1);
    }
    reset(rawIndex);
    moveNext();
  }

  /**
   * Resets the iterator to the given index into the string.
   *
   * After this the [current] value is unset.
   * You must call [moveNext] to make the rune at the position current,
   * or [movePrevious] for the last rune before the position.
   *
   * Setting a negative [rawIndex], or one greater than [:string.length:],
   * throws a [RangeError]. Setting it in the middle of a surrogate pair
   * throws an [ArgumentError]. The iterator is left unchanged when an error
   * is thrown.
   */
  void reset([int rawIndex = 0]) {
    if (rawIndex < 0 || rawIndex > string.length) {
      throw new RangeError.range(rawIndex, 0, string.length);
    }
    _checkSplitSurrogate(rawIndex);
    _position = _nextPosition = rawIndex;
    _currentCodePoint = null;
  }

  /** The rune starting at the current position in the string. */
  int get current => _currentCodePoint;

  /**
   * The number of code units comprising the current rune.
   *
   * Returns zero if there is no current rune ([current] is null).
   */
  int get currentSize => _nextPosition - _position;

  /**
   * A string containing the current rune.
   *
   * For runes outside the basic multilingual plane, this will be
   * a two-character String.
   *
   * Returns null if [current] is null.
   */
  String get currentAsString {
    if (_position == _nextPosition) return null;
    if (_position + 1 == _nextPosition) return string[_position];
    return string.substring(_position, _nextPosition);
  }

  /**
   * Moves forward to the rune starting where the current one ends.
   *
   * Returns true and updates [current] if there is such a rune. Returns
   * false and sets [current] to null if the end of the string was reached.
   */
  bool moveNext() {
    // The new rune starts where the previous one ended.
    _position = _nextPosition;
    if (_position == string.length) {
      _currentCodePoint = null;
      return false;
    }
    int codeUnit = string.charCodeAt(_position);
    int nextPosition = _position + 1;
    if (_isLeadSurrogate(codeUnit) && nextPosition < string.length) {
      int nextCodeUnit = string.charCodeAt(nextPosition);
      if (_isTrailSurrogate(nextCodeUnit)) {
        // Complete surrogate pair: consume both code units.
        _nextPosition = nextPosition + 1;
        _currentCodePoint = _combineSurrogatePair(codeUnit, nextCodeUnit);
        return true;
      }
    }
    // Ordinary code unit or unmatched surrogate: consume one code unit.
    _nextPosition = nextPosition;
    _currentCodePoint = codeUnit;
    return true;
  }

  /**
   * Moves backward to the rune ending where the current one starts.
   *
   * Returns true and updates [current] if there is such a rune. Returns
   * false and sets [current] to null if the start of the string was reached.
   */
  bool movePrevious() {
    // The new rune ends where the previous one started.
    _nextPosition = _position;
    if (_position == 0) {
      _currentCodePoint = null;
      return false;
    }
    int position = _position - 1;
    int codeUnit = string.charCodeAt(position);
    if (_isTrailSurrogate(codeUnit) && position > 0) {
      int prevCodeUnit = string.charCodeAt(position - 1);
      if (_isLeadSurrogate(prevCodeUnit)) {
        // Complete surrogate pair: step back over both code units.
        _position = position - 1;
        _currentCodePoint = _combineSurrogatePair(prevCodeUnit, codeUnit);
        return true;
      }
    }
    // Ordinary code unit or unmatched surrogate: step back one code unit.
    _position = position;
    _currentCodePoint = codeUnit;
    return true;
  }
}

View file

@ -0,0 +1,78 @@
// Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// Exercises [Runes] and [RuneIterator]: forward and backward iteration,
// positioning via rawIndex/reset, and rejection of positions that would
// split a surrogate pair.
main() {
  // Checks that [s] decodes to exactly [expectedRunes] through every access
  // path. Requires at least two runes, the first being a single code unit.
  test(String s, List<int> expectedRunes) {
    Runes runes = s.runes;
    Expect.identical(s, runes.string);

    // Forward iteration with for-in.
    var forward = [];
    for (int rune in runes) {
      forward.add(rune);
    }
    Expect.listEquals(expectedRunes, forward);

    // Manual iteration, backwards from the end of the string.
    var backward = [];
    RuneIterator reverseIt = runes.iterator;
    reverseIt.reset(s.length);
    while (reverseIt.movePrevious()) {
      backward.add(reverseIt.current);
    }
    Expect.listEquals(expectedRunes.reversed.toList(), backward);

    // Setting rawIndex on a fresh iterator.
    RuneIterator cursor = runes.iterator;
    cursor.rawIndex = 1;
    Expect.equals(expectedRunes[1], cursor.current);

    // rawIndex tracks the start of the current rune while stepping forward.
    cursor = runes.iterator;
    cursor.moveNext();
    Expect.equals(0, cursor.rawIndex);
    cursor.moveNext();
    Expect.equals(1, cursor.rawIndex);
    cursor.moveNext();
    Expect.isTrue(1 < cursor.rawIndex);

    // Setting rawIndex after the iterator has already advanced.
    cursor.rawIndex = 1;
    Expect.equals(1, cursor.rawIndex);
    Expect.equals(expectedRunes[1], cursor.current);

    // Reset, then moveNext.
    cursor.reset(1);
    Expect.equals(null, cursor.rawIndex);
    Expect.equals(null, cursor.current);
    cursor.moveNext();
    Expect.equals(1, cursor.rawIndex);
    Expect.equals(expectedRunes[1], cursor.current);

    // Reset, then movePrevious.
    cursor.reset(1);
    Expect.equals(null, cursor.rawIndex);
    Expect.equals(null, cursor.current);
    cursor.movePrevious();
    Expect.equals(0, cursor.rawIndex);
    Expect.equals(expectedRunes[0], cursor.current);

    // Inherited Iterable.map works on runes.
    toHex(x) => x.toRadixString(16);
    Expect.listEquals(expectedRunes.map(toHex).toList(),
                      runes.map(toHex).toList());
  }

  // First character must be single-code-unit for test.
  test("abc", [0x61, 0x62, 0x63]);
  test("\x00\u0000\u{000000}", [0, 0, 0]);
  test("\u{ffff}\u{10000}\u{10ffff}", [0xffff, 0x10000, 0x10ffff]);

  // Unmatched surrogates on both sides of a proper surrogate pair.
  String mixedSurrogates = new String.fromCharCodes(
      [0xdc00, 0xd800, 61, 0xd800, 0xdc00, 62, 0xdc00, 0xd800]);
  test(mixedSurrogates, [0xdc00, 0xd800, 61, 0x10000, 62, 0xdc00, 0xd800]);

  // Setting position in the middle of a surrogate pair is not allowed.
  var astral = new Runes("\u{10000}");
  var pairIt = astral.iterator;
  pairIt.moveNext();
  Expect.equals(0x10000, pairIt.current);
  // Both the rawIndex setter and reset must reject index 1.
  Expect.throws(() { pairIt.rawIndex = 1; }, (e) => e is Error);
  Expect.throws(() { pairIt.reset(1); }, (e) => e is Error);
}