mirror of
https://github.com/dart-lang/sdk
synced 2024-09-15 22:19:49 +00:00
Allow surrogates in string literals.
Fixes issue #26620 BUG: http://dartbug.com/26620 R=asiva@google.com, brianwilkerson@google.com, floitsch@google.com, hausner@google.com, sigmund@google.com Review URL: https://codereview.chromium.org/2304923002 .
This commit is contained in:
parent
3a4b32d957
commit
574ae435f3
|
@ -2790,7 +2790,7 @@ Strings support escape sequences for special characters. The escapes are:
|
|||
|
||||
$\backslash$u\{$HEX\_DIGIT_1$ $HEX\_DIGIT_2$\}.
|
||||
\item $\backslash$u $HEX\_DIGIT_1$ $HEX\_DIGIT_2$ $HEX\_DIGIT_3$ $HEX\_DIGIT_4$, equivalent to $\backslash$u\{$HEX\_DIGIT_1$ $HEX\_DIGIT_2$ $HEX\_DIGIT_3$ $HEX\_DIGIT_4$\}.
|
||||
\item $\backslash$u\{$HEX\_DIGIT\_SEQUENCE$\} is the unicode scalar value represented by the $HEX\_DIGIT\_SEQUENCE$. It is a compile-time error if the value of the $HEX\_DIGIT\_SEQUENCE$ is not a valid unicode scalar value.
|
||||
\item $\backslash$u\{$HEX\_DIGIT\_SEQUENCE$\} is the Unicode code point represented by the $HEX\_DIGIT\_SEQUENCE$. It is a compile-time error if the value of the $HEX\_DIGIT\_SEQUENCE$ is not a valid Unicode code point.
|
||||
\item \$ indicating the beginning of an interpolated expression.
|
||||
\item Otherwise, $\backslash k$ indicates the character $k$ for any $k$ not in $\{n, r, f, b, t, v, x, u\}$.
|
||||
\end{itemize}
|
||||
|
|
|
@ -5370,25 +5370,24 @@ class Parser {
|
|||
}
|
||||
|
||||
/**
|
||||
* Append the character equivalent of the given [scalarValue] to the given
|
||||
* Append the character equivalent of the given [codePoint] to the given
|
||||
* [buffer]. Use the [startIndex] and [endIndex] to report an error, and
|
||||
* don't append anything to the builder, if the scalar value is invalid. The
|
||||
* don't append anything to the builder, if the code point is invalid. The
|
||||
* [escapeSequence] is the escape sequence that was parsed to produce the
|
||||
* scalar value (used for error reporting).
|
||||
* code point (used for error reporting).
|
||||
*/
|
||||
void _appendScalarValue(StringBuffer buffer, String escapeSequence,
|
||||
int scalarValue, int startIndex, int endIndex) {
|
||||
if (scalarValue < 0 ||
|
||||
scalarValue > Character.MAX_CODE_POINT ||
|
||||
(scalarValue >= 0xD800 && scalarValue <= 0xDFFF)) {
|
||||
void _appendCodePoint(StringBuffer buffer, String source,
|
||||
int codePoint, int startIndex, int endIndex) {
|
||||
if (codePoint < 0 || codePoint > Character.MAX_CODE_POINT) {
|
||||
String escapeSequence = source.substring(startIndex, endIndex + 1);
|
||||
_reportErrorForCurrentToken(
|
||||
ParserErrorCode.INVALID_CODE_POINT, [escapeSequence]);
|
||||
return;
|
||||
}
|
||||
if (scalarValue < Character.MAX_VALUE) {
|
||||
buffer.writeCharCode(scalarValue);
|
||||
if (codePoint < Character.MAX_VALUE) {
|
||||
buffer.writeCharCode(codePoint);
|
||||
} else {
|
||||
buffer.write(Character.toChars(scalarValue));
|
||||
buffer.write(Character.toChars(codePoint));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7747,8 +7746,7 @@ class Parser {
|
|||
// Illegal escape sequence: not enough or too many hex digits
|
||||
_reportErrorForCurrentToken(ParserErrorCode.INVALID_UNICODE_ESCAPE);
|
||||
}
|
||||
_appendScalarValue(buffer, lexeme.substring(index, currentIndex + 1),
|
||||
value, index, currentIndex);
|
||||
_appendCodePoint(buffer, lexeme, value, index, currentIndex);
|
||||
return currentIndex + 1;
|
||||
} else {
|
||||
if (currentIndex + 3 >= length) {
|
||||
|
@ -7767,9 +7765,9 @@ class Parser {
|
|||
// Illegal escape sequence: invalid hex digits
|
||||
_reportErrorForCurrentToken(ParserErrorCode.INVALID_UNICODE_ESCAPE);
|
||||
} else {
|
||||
_appendScalarValue(
|
||||
_appendCodePoint(
|
||||
buffer,
|
||||
lexeme.substring(index, currentIndex + 1),
|
||||
lexeme,
|
||||
(((((Character.digit(firstDigit, 16) << 4) +
|
||||
Character.digit(secondDigit, 16)) <<
|
||||
4) +
|
||||
|
|
|
@ -1468,7 +1468,7 @@ class Foo {
|
|||
}
|
||||
|
||||
void test_invalidCodePoint() {
|
||||
createParser("'\\uD900'");
|
||||
createParser("'\\u{110000}'");
|
||||
StringLiteral literal = parser.parseStringLiteral();
|
||||
expectNotNullIfNoErrors(literal);
|
||||
listener.assertErrorsWithCodes([ParserErrorCode.INVALID_CODE_POINT]);
|
||||
|
|
|
@ -111,13 +111,10 @@ class StringValidator {
|
|||
Token token, int startOffset, String string, StringQuoting quoting) {
|
||||
// We need to check for invalid x and u escapes, for line
|
||||
// terminators in non-multiline strings, and for invalid Unicode
|
||||
// scalar values (either directly or as u-escape values). We also check
|
||||
// for unpaired UTF-16 surrogates.
|
||||
// code points (either directly or as u-escape values).
|
||||
int length = 0;
|
||||
int index = startOffset;
|
||||
bool containsEscape = false;
|
||||
bool previousWasLeadSurrogate = false;
|
||||
bool invalidUtf16 = false;
|
||||
var stringIter = string.codeUnits.iterator;
|
||||
for (HasNextIterator<int> iter = new HasNextIterator(stringIter);
|
||||
iter.hasNext;
|
||||
|
@ -199,26 +196,13 @@ class StringValidator {
|
|||
code = value;
|
||||
}
|
||||
}
|
||||
if (code >= 0x10000) length++;
|
||||
// This handles both unescaped characters and the value of unicode
|
||||
// escapes.
|
||||
if (previousWasLeadSurrogate) {
|
||||
if (!isUtf16TrailSurrogate(code)) {
|
||||
invalidUtf16 = true;
|
||||
break;
|
||||
if (code >= 0x10000) {
|
||||
length++;
|
||||
if (code > 0x10FFFF) {
|
||||
stringParseError("Invalid code point", token, index);
|
||||
}
|
||||
previousWasLeadSurrogate = false;
|
||||
} else if (isUtf16LeadSurrogate(code)) {
|
||||
previousWasLeadSurrogate = true;
|
||||
} else if (!isUnicodeScalarValue(code)) {
|
||||
invalidUtf16 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (previousWasLeadSurrogate || invalidUtf16) {
|
||||
stringParseError("Invalid Utf16 surrogate", token, index);
|
||||
return null;
|
||||
}
|
||||
// String literal successfully validated.
|
||||
if (quoting.raw || !containsEscape) {
|
||||
// A string without escapes could just as well have been raw.
|
||||
|
|
|
@ -484,9 +484,7 @@ void Scanner::ScanEscapedCodePoint(int32_t* code_point) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (is_valid &&
|
||||
((Utf::IsOutOfRange(*code_point) ||
|
||||
(Utf16::IsSurrogate(*code_point))))) {
|
||||
if (is_valid && (Utf::IsOutOfRange(*code_point))) {
|
||||
ErrorMsg("invalid code point");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -192,62 +192,6 @@ static void StringEscapes() {
|
|||
|
||||
|
||||
static void InvalidStringEscapes() {
|
||||
const GrowableTokenStream& high_start_4 =
|
||||
Scan("\"\\uD800\"");
|
||||
EXPECT_EQ(2, high_start_4.length());
|
||||
CheckKind(high_start_4, 0, Token::kERROR);
|
||||
EXPECT(high_start_4[0].literal->Equals("invalid code point"));
|
||||
CheckKind(high_start_4, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& high_start_seq =
|
||||
Scan("\"\\u{D800}\"");
|
||||
EXPECT_EQ(2, high_start_seq.length());
|
||||
CheckKind(high_start_seq, 0, Token::kERROR);
|
||||
EXPECT(high_start_seq[0].literal->Equals("invalid code point"));
|
||||
CheckKind(high_start_seq, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& high_end_4 =
|
||||
Scan("\"\\uDBFF\"");
|
||||
EXPECT_EQ(2, high_end_4.length());
|
||||
CheckKind(high_end_4, 0, Token::kERROR);
|
||||
EXPECT(high_end_4[0].literal->Equals("invalid code point"));
|
||||
CheckKind(high_end_4, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& high_end_seq =
|
||||
Scan("\"\\u{DBFF}\"");
|
||||
EXPECT_EQ(2, high_end_seq.length());
|
||||
CheckKind(high_end_seq, 0, Token::kERROR);
|
||||
EXPECT(high_end_seq[0].literal->Equals("invalid code point"));
|
||||
CheckKind(high_end_seq, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& low_start_4 =
|
||||
Scan("\"\\uDC00\"");
|
||||
EXPECT_EQ(2, low_start_4.length());
|
||||
CheckKind(low_start_4, 0, Token::kERROR);
|
||||
EXPECT(low_start_4[0].literal->Equals("invalid code point"));
|
||||
CheckKind(low_start_4, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& low_start_seq =
|
||||
Scan("\"\\u{DC00}\"");
|
||||
EXPECT_EQ(2, low_start_seq.length());
|
||||
CheckKind(low_start_seq, 0, Token::kERROR);
|
||||
EXPECT(low_start_seq[0].literal->Equals("invalid code point"));
|
||||
CheckKind(low_start_seq, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& low_end_4 =
|
||||
Scan("\"\\uDFFF\"");
|
||||
EXPECT_EQ(2, low_end_4.length());
|
||||
CheckKind(low_end_4, 0, Token::kERROR);
|
||||
EXPECT(low_end_4[0].literal->Equals("invalid code point"));
|
||||
CheckKind(low_end_4, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& low_end_seq =
|
||||
Scan("\"\\u{DFFF}\"");
|
||||
EXPECT_EQ(2, low_end_seq.length());
|
||||
CheckKind(low_end_seq, 0, Token::kERROR);
|
||||
EXPECT(low_end_seq[0].literal->Equals("invalid code point"));
|
||||
CheckKind(low_end_seq, 1, Token::kEOS);
|
||||
|
||||
const GrowableTokenStream& out_of_range_low =
|
||||
Scan("\"\\u{110000}\"");
|
||||
EXPECT_EQ(2, out_of_range_low.length());
|
||||
|
|
40
tests/language/string_literals_test.dart
Normal file
40
tests/language/string_literals_test.dart
Normal file
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) 2016, the Dart project authors. Please see the AUTHORS file
|
||||
// for details. All rights reserved. Use of this source code is governed by a
|
||||
// BSD-style license that can be found in the LICENSE file.
|
||||
|
||||
import "package:expect/expect.dart";
|
||||
|
||||
main() {
|
||||
var expect = new String.fromCharCodes([
|
||||
0, 0x0a, 0x0d, 0x7f, 0xff, 0xffff, 0xd800, 0xdc00, 0xdbff, 0xdfff
|
||||
]);
|
||||
test(string) {
|
||||
Expect.equals(expect, string);
|
||||
}
|
||||
|
||||
// Plain escapes of code points.
|
||||
test("\x00\x0a\x0d\x7f\xff\uffff\u{10000}\u{10ffff}");
|
||||
test("""\x00\x0a\x0d\x7f\xff\uffff\u{10000}\u{10ffff}""");
|
||||
test('\x00\x0a\x0d\x7f\xff\uffff\u{10000}\u{10ffff}');
|
||||
test('''\x00\x0a\x0d\x7f\xff\uffff\u{10000}\u{10ffff}''');
|
||||
// Plain escapes of individual code units.
|
||||
test("\x00\x0a\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff");
|
||||
test("""\x00\x0a\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff""");
|
||||
test('\x00\x0a\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff');
|
||||
test('''\x00\x0a\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff''');
|
||||
// Insert newline into multiline string.
|
||||
test("""\x00
|
||||
\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff""");
|
||||
test('''\x00
|
||||
\x0d\x7f\xff\uffff\ud800\udc00\udbff\udfff''');
|
||||
// Extract individual code units from strings holding a single supplementary code point escape.
|
||||
test("\x00\x0a\x0d\x7f\xff\uffff"
|
||||
"${"\u{10000}"[0]}${"\u{10000}"[1]}"
|
||||
"${"\u{10FFFF}"[0]}${"\u{10FFFF}"[1]}");
|
||||
test("\x00\x0a\x0d\x7f\xff\uffff" +
|
||||
"\ud800" + "\udc00\udbff" + "\udfff");
|
||||
// Single line string over multiple lines with newlines inside interpolation.
|
||||
test("\x00\x0a\x0d\x7f\xff${
|
||||
""
|
||||
}\uffff\ud800\udc00\udbff\udfff");
|
||||
}
|
|
@ -5,10 +5,8 @@
|
|||
class StringUnicode4NegativeTest {
|
||||
|
||||
static testMain() {
|
||||
// Unicode escapes must refer to valid Unicode points and not surrogate characters
|
||||
// Unicode escapes must refer to valid Unicode code points.
|
||||
String str = "Foo\u{FFFFFF}";
|
||||
str = "Foo\uD800";
|
||||
str = "Foo\uDC00";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue