Optimize LineSplitter.

Avoid quadratic behavior when multiple chunks fail to
have a line break, and the carry-over string gets repeatedly extended.

In the original chunked conversion code, the chunk handling code retained the trailing, non-line-terminated text of the previous chunk, then eagerly concatenated it with the next chunk in order to continue looking for lines. That's moderately effective when lines are shorter than chunks, and neither are too large.
However, a very long line spread across many chunks would perform repeated string concatenation with quadratic time complexity.

This change gives `LineSplitter` the option of using a `StringBuffer` to collect multiple carry-over line parts.
The buffer is needed whenever a chunk does not contain a line break, and needs to be combined with a previous chunk's carry-over. This avoids ever directly concatenating any more than two strings.
The `StringBuffer` is not allocated until it's first needed, so if lines are generally shorter than chunks, the buffer won't be used. Once allocated, the buffer is retained in case a buffer will be needed again, but cleared when its contents are used.

The code optimizes for the simple case of each chunk having a line break.

Fixes #51167

Bug: https://github.com/dart-lang/sdk/issues/51167
Change-Id: I600a011e02aa9f1ad6f88e45764df5b2e8eccfa3
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/280100
Reviewed-by: Leaf Petersen <leafp@google.com>
Reviewed-by: Stephen Adams <sra@google.com>
Commit-Queue: Lasse Nielsen <lrn@google.com>
Reviewed-by: Siva Annamalai <asiva@google.com>
Reviewed-by: Aske Simon Christensen <askesc@google.com>
Reviewed-by: Nate Bosch <nbosch@google.com>
Reviewed-by: Sigmund Cherem <sigmund@google.com>
This commit is contained in:
Lasse R.H. Nielsen 2023-02-21 11:33:24 +00:00 committed by Commit Queue
parent 3ab2dfaaaa
commit 164f8588d6
5 changed files with 320 additions and 29 deletions

View file

@ -84,7 +84,6 @@ class LineSplitter extends StreamTransformerBase<String, String> {
}
}
// TODO(floitsch): deal with utf8.
class _LineSplitterSink extends StringConversionSinkBase {
final StringConversionSink _sink;
@ -92,8 +91,22 @@ class _LineSplitterSink extends StringConversionSinkBase {
///
/// If the previous slice ended in a line without a line terminator,
/// then the next slice may continue the line.
///
/// Set to `null` if there is no carry (the previous chunk ended on
/// a line break).
/// Set to an empty string if carry-over comes from multiple chunks,
/// in which case the parts are stored in [_multiCarry].
String? _carry;
/// Cache of multiple parts of carry-over.
///
/// If a line is split over multiple chunks, avoid doing
/// repeated string concatenation, and instead store the chunks
/// into this stringbuffer.
///
/// Is empty when `_carry` is `null` or a non-empty string.
StringBuffer? _multiCarry;
/// Whether to skip a leading LF character from the next slice.
///
/// If the previous slice ended on a CR character, a following LF
@ -108,38 +121,31 @@ class _LineSplitterSink extends StringConversionSinkBase {
end = RangeError.checkValidRange(start, end, chunk.length);
// If the chunk is empty, it's probably because it's the last one.
// Handle that here, so we know the range is non-empty below.
if (start >= end) {
if (isLast) close();
return;
}
String? carry = _carry;
if (carry != null) {
assert(!_skipLeadingLF);
chunk = carry + chunk.substring(start, end);
start = 0;
end = chunk.length;
_carry = null;
} else if (_skipLeadingLF) {
if (chunk.codeUnitAt(start) == _LF) {
start += 1;
if (start < end) {
if (_skipLeadingLF) {
if (chunk.codeUnitAt(start) == _LF) {
start += 1;
}
_skipLeadingLF = false;
}
_skipLeadingLF = false;
_addLines(chunk, start, end, isLast);
}
_addLines(chunk, start, end);
if (isLast) close();
}
void close() {
if (_carry != null) {
_sink.add(_carry!);
_carry = null;
var carry = _carry;
if (carry != null) {
_sink.add(_useCarry(carry, ""));
}
_sink.close();
}
void _addLines(String lines, int start, int end) {
void _addLines(String lines, int start, int end, bool isLast) {
var sliceStart = start;
var char = 0;
var carry = _carry;
for (var i = start; i < end; i++) {
var previousChar = char;
char = lines.codeUnitAt(i);
@ -150,15 +156,81 @@ class _LineSplitterSink extends StringConversionSinkBase {
continue;
}
}
_sink.add(lines.substring(sliceStart, i));
var slice = lines.substring(sliceStart, i);
if (carry != null) {
slice = _useCarry(carry, slice); // Resets _carry to `null`.
carry = null;
}
_sink.add(slice);
sliceStart = i + 1;
}
if (sliceStart < end) {
_carry = lines.substring(sliceStart, end);
var endSlice = lines.substring(sliceStart, end);
if (isLast) {
// Emit last line instead of carrying it over to the
// immediately following `close` call.
if (carry != null) {
endSlice = _useCarry(carry, endSlice);
}
_sink.add(endSlice);
return;
}
if (carry == null) {
// Common case, this chunk contained at least one line-break.
_carry = endSlice;
} else {
_addCarry(carry, endSlice);
}
} else {
_skipLeadingLF = (char == _CR);
}
}
/// Adds [newCarry] to existing carry-over.
///
/// Always goes into [_multiCarry], we only call here if there
/// was an existing carry that the new carry needs to be combined with.
///
/// Only happens when a line is spread over more than two chunks.
/// The [existingCarry] is always the current value of [_carry].
/// (We pass the existing carry as an argument because we have already
/// checked that it is non-`null`.)
void _addCarry(String existingCarry, String newCarry) {
assert(existingCarry == _carry);
assert(newCarry.isNotEmpty);
var multiCarry = _multiCarry ??= StringBuffer();
if (existingCarry.isNotEmpty) {
assert(multiCarry.isEmpty);
multiCarry.write(existingCarry);
_carry = "";
}
multiCarry.write(newCarry);
}
/// Consumes and combines existing carry-over with continuation string.
///
/// The [carry] value is always the current value of [_carry],
/// which is non-`null` when this method is called.
/// If that value is the empty string, the actual carry-over is stored
/// in [_multiCarry].
///
/// The [continuation] is only empty if called from [close].
String _useCarry(String carry, String continuation) {
assert(carry == _carry);
_carry = null;
if (carry.isNotEmpty) {
return carry + continuation;
}
var multiCarry = _multiCarry!;
multiCarry.write(continuation);
var result = multiCarry.toString();
// If it happened once, it may happen again.
// Keep the string buffer around.
multiCarry.clear();
return result;
}
}
class _LineSplitterEventSink extends _LineSplitterSink

View file

@ -0,0 +1,39 @@
// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library line_splitter_test;
import 'dart:convert';
import "package:expect/expect.dart";
void main() {
testEfficency();
}
/// Regression test for https://dartbug.com/51167
///
/// Had quadratic time behavior when concatenating chunks without linebreaks.
///
/// Should now only use linear time/space for buffering.
void testEfficency() {
// After fix: finishes in < 1 second on desktop.
// Before fix, with N = 100000, took 25 seconds.
const N = 1000000;
String result = ""; // Starts empty, set once.
var sink = LineSplitter()
.startChunkedConversion(ChunkedConversionSink.withCallback((lines) {
// Gets called only once with exactly one line.
Expect.equals("", result);
Expect.equals(1, lines.length);
var line = lines.first;
Expect.notEquals("", line);
result = line;
}));
for (var i = 0; i < N; i++) {
sink.add("xy");
}
sink.close();
Expect.equals("xy" * N, result);
}

View file

@ -4,10 +4,11 @@
library line_splitter_test;
import "package:expect/expect.dart";
import 'dart:async';
import 'dart:convert';
import 'dart:math' as MATH;
import 'dart:math';
import "package:expect/expect.dart";
const lineTerminators = const ['\n', '\r', '\r\n'];
@ -19,6 +20,7 @@ void main() {
testReadLine1();
testReadLine2();
testChunkedConversion();
testCarry();
}
void testManyLines() {
@ -49,7 +51,7 @@ String _getLinesSliced(String str) {
const chunkSize = 3;
var index = 0;
while (index < str.length) {
var end = MATH.min(str.length, index + chunkSize);
var end = min(str.length, index + chunkSize);
sink.addSlice(str, index, end, false);
index += chunkSize;
@ -218,3 +220,71 @@ void testChunkedConversion() {
}
}
}
void testCarry() {
// Test, when multiple chunks contribute to the same line,
// That the carry-over is combined correctly.
// Test multiple chunks of carry, ending in linefeeds or not.
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.add("\nrstu"); // Followed by \n.
sink.close();
Expect.listEquals(["abcdfghijklmnopq", "rstu"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.add("rstu"); // Not followed by \n.
sink.close();
Expect.listEquals(["abcdfghijklmnopq", "rstu"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.close(); // Not followed by anything.
Expect.listEquals(["abcdfghijklmnopq"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq"); // Multiple chunks in carry, no linebreak.
sink.close();
Expect.listEquals(["abcdfghijklmnopq"], output);
}
}

View file

@ -0,0 +1,40 @@
// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// @dart = 2.9
library line_splitter_test;
import 'dart:convert';
import "package:expect/expect.dart";
void main() {
testEfficency();
}
/// Regression test for https://dartbug.com/51167
///
/// Had quadratic time behavior when concatenating chunks without linebreaks.
///
/// Should now only use linear time/space for buffering.
void testEfficency() {
// After fix: finishes in < 1 second on desktop.
// Before fix, with N = 100000, took 25 seconds.
const N = 1000000;
String result = ""; // Starts empty, set once.
var sink = LineSplitter()
.startChunkedConversion(ChunkedConversionSink.withCallback((lines) {
Expect.equals("", result);
Expect.equals(1, lines.length);
var line = lines.first;
Expect.notEquals("", line);
result = line;
}));
for (var i = 0; i < N; i++) {
sink.add("xy");
}
sink.close();
Expect.equals("xy" * N, result);
}

View file

@ -6,10 +6,11 @@
library line_splitter_test;
import "package:expect/expect.dart";
import 'dart:async';
import 'dart:convert';
import 'dart:math' as MATH;
import 'dart:math';
import "package:expect/expect.dart";
const lineTerminators = const ['\n', '\r', '\r\n'];
@ -21,6 +22,7 @@ void main() {
testReadLine1();
testReadLine2();
testChunkedConversion();
testCarry();
}
void testManyLines() {
@ -51,7 +53,7 @@ String _getLinesSliced(String str) {
const chunkSize = 3;
var index = 0;
while (index < str.length) {
var end = MATH.min(str.length, index + chunkSize);
var end = min(str.length, index + chunkSize);
sink.addSlice(str, index, end, false);
index += chunkSize;
@ -220,3 +222,71 @@ void testChunkedConversion() {
}
}
}
void testCarry() {
// Test, when multiple chunks contribute to the same line,
// That the carry-over is combined correctly.
// Test multiple chunks of carry, ending in linefeeds or not.
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.add("\nrstu"); // Followed by \n.
sink.close();
Expect.listEquals(["abcdfghijklmnopq", "rstu"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.add("rstu"); // Not followed by \n.
sink.close();
Expect.listEquals(["abcdfghijklmnopq", "rstu"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq\r"); // Multiple chunks in carry, ends in \r.
sink.close(); // Not followed by anything.
Expect.listEquals(["abcdfghijklmnopq"], output);
}
{
var output = [];
var splitter = new LineSplitter();
var outSink = new ChunkedConversionSink<String>.withCallback(output.addAll);
var sink = splitter.startChunkedConversion(outSink);
sink.add("abcd");
sink.add("fghi");
sink.add("jklm");
sink.add("nopq"); // Multiple chunks in carry, no linebreak.
sink.close();
Expect.listEquals(["abcdfghijklmnopq"], output);
}
}