dart-sdk/benchmarks/MemoryCopy/dart/MemoryCopy.dart
Tess Strickland 4af0a59e8c [benchmarks] Change the MemoryCopy memcpy benchmarks to use memmove.
While our benchmarks don't involve overlapping memory between source
and destination, general methods for copying between TypedData must.
Thus, our benchmark for using the C interface via FFI must use memmove
instead of memcpy.

To avoid having to update our benchmark configurations, the name of
that benchmark is unchanged.

In addition, this CL adds filtering for benchmark names and turning on
and off specific outputs for quick comparisons when running manually.

Change-Id: I20616549d8bc9ab481884846d3f13df20a3c854e
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/319981
Commit-Queue: Tess Strickland <sstrickl@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
2023-08-11 13:17:17 +00:00

479 lines
14 KiB
Dart

// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
// Micro-benchmarks for copying typed data lists.
import 'dart:ffi';
import 'dart:math';
import 'dart:typed_data';
import 'package:args/args.dart';
import 'package:ffi/ffi.dart';
const maxSizeInBytes = 10 * 1024 * 1024;
final argParser = ArgParser()
..addMultiOption('length',
abbr: 'l',
help: 'Byte length to benchmark',
valueHelp: 'INT',
defaultsTo: const [])
..addFlag('mebibytes-per-second',
abbr: 'm', help: 'Show MiB/s', defaultsTo: false)
..addFlag('nanoseconds-per-byte',
abbr: 'n', help: 'Show ns/byte', defaultsTo: false)
..addFlag('bytes-per-second',
abbr: 'b', help: 'Show byte/s', defaultsTo: true)
..addFlag('verbose', abbr: 'v', help: 'Verbose output', defaultsTo: false)
..addFlag('aligned',
abbr: 'a', help: 'Align results on initial numbers', defaultsTo: false);
class Emitter {
final bool bytesPerSecond;
final bool nanosecondsPerByte;
final bool mebibytesPerSecond;
final bool _alignedOutput;
Emitter(ArgResults results)
: bytesPerSecond = results['bytes-per-second'] || results['verbose'],
nanosecondsPerByte =
results['nanoseconds-per-byte'] || results['verbose'],
mebibytesPerSecond =
results['mebibytes-per-second'] || results['verbose'],
_alignedOutput = results['aligned'];
static final kValueRegexp = RegExp(r'^([0-9]+)');
static final kMaxLabelLength =
'MemoryCopy.1048576.setRange.TypedData.Double(NanosecondsPerChar)'.length;
// Maximum expected number of digits on either side of the decimal point.
static final kMaxDigits = 16;
void printLabeledValue(String label, double value) {
final valueString = value.toString();
final buffer = StringBuffer();
buffer
..write(label)
..write(': ');
if (_alignedOutput) {
final matches = kValueRegexp.firstMatch(valueString)!;
final valuePadding = (kMaxLabelLength - label.length) +
max<int>(kMaxDigits - matches[1]!.length, 0);
buffer..write(' ' * valuePadding);
}
buffer.write(valueString);
print(buffer.toString());
}
}
// A modified version of BenchmarkBase from package:benchmark_harness where
// - the run() method takes a number of rounds, so that there is only one run()
// call per measurement and thus the overhead of calling the run() method is
// the same across subclass results.
// - the measureFor() method returns the number of bytes transfered per second,
// not the number of microseconds per iteration (round).
abstract class MemoryCopyBenchmark {
final String name;
final int bytes;
MemoryCopyBenchmark(String name, this.bytes) : name = 'MemoryCopy.$name';
static const targetBatchSizeInBytes = 32 * 1024;
// Returns the number of bytes copied per second.
double measureFor(Duration minDuration) {
// The logic below is based off of BenchmarkBase._measureForImpl.
// We can't use BenchmarkBase.measureFor directly, because
// * it calls the function in a loop instead of passing the number of
// desired iterations to the function being called. Here, method
// invocation would dominate the actual body for small byte counts.
// * it doesn't provide the caller with the number of iterations performed,
// which we need to calculate the number of bytes transferred.
// Start off with enough rounds to ensure a minimum number of bytes copied
// per run() invocation.
int rounds = max(targetBatchSizeInBytes ~/ bytes, 1);
// If running a long measurement permit some amount of measurement jitter
// to avoid discarding results that almost, but not quite, reach the minimum
// duration requested.
final allowedJitter = Duration(
microseconds: minDuration.inSeconds > 0
? (minDuration.inMicroseconds * 0.1).floor()
: 0);
final watch = Stopwatch()..start();
while (true) {
// Try running for the current number of rounds and see if that reaches
// the minimum duration requested, so we only get the elapsed time from
// the StopWatch once for the final results used.
watch.reset();
run(rounds);
final elapsed = watch.elapsed;
final numberOfBytesCopied = rounds * bytes;
if (elapsed >= (minDuration - allowedJitter)) {
return (numberOfBytesCopied / elapsed.inMicroseconds) *
Duration.microsecondsPerSecond;
}
// If not, then adjust our estimate of how many iterations are needed to
// reach the minimum and try again.
if (elapsed.inMilliseconds == 0) {
rounds *= 1000;
} else {
rounds *= (minDuration.inMicroseconds / elapsed.inMicroseconds).ceil();
}
}
}
double measure() {
setup();
// Warmup for 100 ms.
measureFor(const Duration(milliseconds: 100));
// Run benchmark for 1 second.
final double result = measureFor(const Duration(seconds: 1));
teardown();
return result;
}
void report(Emitter emitter) {
final bytesPerSecond = measure();
if (emitter.bytesPerSecond) {
emitter.printLabeledValue('$name(BytesPerSecond)', bytesPerSecond);
}
if (emitter.nanosecondsPerByte) {
const nanoSecondsPerSecond = 1000 * 1000 * 1000;
final nanosecondsPerByte = nanoSecondsPerSecond / bytesPerSecond;
emitter.printLabeledValue(
'$name(NanosecondsPerChar)', nanosecondsPerByte);
}
if (emitter.mebibytesPerSecond) {
const bytesPerMebibyte = 1024 * 1024;
final mibPerSecond = bytesPerSecond / bytesPerMebibyte;
emitter.printLabeledValue('$name(MebibytesPerSecond)', mibPerSecond);
}
}
void setup();
void teardown();
void run(int rounds);
}
abstract class Uint8ListCopyBenchmark extends MemoryCopyBenchmark {
final int count;
late Uint8List input;
late Uint8List result;
Uint8ListCopyBenchmark(String method, int bytes)
: count = bytes,
super('$bytes.$method.TypedData.Uint8', bytes);
@override
void setup() {
input = Uint8List(count);
for (int i = 0; i < count; ++i) {
input[i] = (i + 3) & 0xff;
}
result = Uint8List(maxSizeInBytes);
}
@override
void teardown() {
for (int i = 0; i < count; ++i) {
final expected = (i + 3) & 0xff;
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
final expected = 0;
for (int i = count; i < maxSizeInBytes; ++i) {
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
}
}
class Uint8ListCopyViaLoopBenchmark extends Uint8ListCopyBenchmark {
Uint8ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes);
@override
void run(int rounds) {
final count = this.count;
final input = this.input;
final result = this.result;
for (int r = 0; r < rounds; r++) {
for (int i = 0; i < count; i++) {
result[i] = input[i];
}
}
}
}
class Uint8ListCopyViaSetRangeBenchmark extends Uint8ListCopyBenchmark {
Uint8ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);
@override
void run(int rounds) {
for (int r = 0; r < rounds; r++) {
result.setRange(0, count, input);
}
}
}
abstract class Float64ListCopyBenchmark extends MemoryCopyBenchmark {
final int count;
late Float64List input;
late Float64List result;
Float64ListCopyBenchmark(String method, int bytes)
: count = bytes ~/ 8,
super('$bytes.$method.TypedData.Double', bytes);
static const maxSizeInElements = maxSizeInBytes ~/ 8;
@override
void setup() {
input = Float64List(count);
for (int i = 0; i < count; ++i) {
input[i] = (i - 7).toDouble();
}
result = Float64List(maxSizeInElements);
}
@override
void teardown() {
for (int i = 0; i < count; ++i) {
final expected = (i - 7).toDouble();
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
final expected = 0.0;
for (int i = count; i < maxSizeInElements; ++i) {
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
}
}
class Float64ListCopyViaLoopBenchmark extends Float64ListCopyBenchmark {
Float64ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes);
@override
void run(int rounds) {
final count = this.count;
final input = this.input;
final result = this.result;
for (int r = 0; r < rounds; r++) {
for (int i = 0; i < count; i++) {
result[i] = input[i];
}
}
}
}
class Float64ListCopyViaSetRangeBenchmark extends Float64ListCopyBenchmark {
Float64ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);
@override
void run(int rounds) {
for (int r = 0; r < rounds; r++) {
result.setRange(0, count, input);
}
}
}
abstract class PointerUint8CopyBenchmark extends MemoryCopyBenchmark {
final int count;
late Pointer<Uint8> input;
late Pointer<Uint8> result;
PointerUint8CopyBenchmark(String method, int bytes)
: count = bytes,
super('$bytes.$method.Pointer.Uint8', bytes);
@override
void setup() {
input = malloc<Uint8>(count);
for (var i = 0; i < count; ++i) {
input[i] = (i + 3) & 0xff;
}
result = calloc<Uint8>(maxSizeInBytes);
}
@override
void teardown() {
malloc.free(input);
for (var i = 0; i < count; ++i) {
final expected = (i + 3) & 0xff;
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
final expected = 0;
for (var i = count; i < maxSizeInBytes; ++i) {
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
calloc.free(result);
}
}
class PointerUint8CopyViaLoopBenchmark extends PointerUint8CopyBenchmark {
PointerUint8CopyViaLoopBenchmark(int bytes) : super('loop', bytes);
@override
void run(int rounds) {
// Compare the setRange version to looping using Pointer.[]/Pointer.[]=.
final count = this.count;
final input = this.input;
final result = this.result;
for (int r = 0; r < rounds; r++) {
for (int i = 0; i < count; i++) {
result[i] = input[i];
}
}
}
}
class PointerUint8CopyViaSetRangeBenchmark extends PointerUint8CopyBenchmark {
PointerUint8CopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);
@override
void run(int rounds) {
for (int r = 0; r < rounds; r++) {
result
.asTypedList(maxSizeInBytes)
.setRange(0, count, input.asTypedList(count));
}
}
}
@Native<Void Function(Pointer<Void>, Pointer<Void>, Size)>(isLeaf: true)
external void memmove(Pointer<Void> to, Pointer<Void> from, int size);
class PointerUint8CopyViaMemmoveBenchmark extends PointerUint8CopyBenchmark {
// This particular benchmark was originally written using memcpy, but a
// better comparison is against memmove. While our benchmarks don't use
// to and from memory that overlaps, in general this case must be handled.
//
// In order to not have to change the benchmark suite in golem, we keep the
// old name for this result.
PointerUint8CopyViaMemmoveBenchmark(int bytes) : super('memcpy', bytes);
@override
void run(int rounds) {
for (int r = 0; r < rounds; r++) {
memmove(result.cast(), input.cast(), count);
}
}
}
abstract class PointerDoubleCopyBenchmark extends MemoryCopyBenchmark {
final int count;
late Pointer<Double> input;
late Pointer<Double> result;
PointerDoubleCopyBenchmark(String method, int bytes)
: count = bytes ~/ 8,
super('$bytes.$method.Pointer.Double', bytes);
static const maxSizeInElements = maxSizeInBytes ~/ 8;
@override
void setup() {
input = malloc<Double>(count);
for (var i = 0; i < count; ++i) {
input[i] = (i - 7).toDouble();
}
result = calloc<Double>(maxSizeInElements);
}
@override
void teardown() {
malloc.free(input);
for (var i = 0; i < count; ++i) {
final expected = (i - 7).toDouble();
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
final expected = 0.0;
for (var i = count; i < maxSizeInElements; ++i) {
if (result[i] != expected) {
throw 'Expected result[$i] = $expected, got ${result[i]}';
}
}
calloc.free(result);
}
}
class PointerDoubleCopyViaLoopBenchmark extends PointerDoubleCopyBenchmark {
PointerDoubleCopyViaLoopBenchmark(int bytes) : super('loop', bytes);
@override
void run(int rounds) {
// Compare the setRange version to looping using Pointer.[]/Pointer.[]=.
final count = this.count;
final input = this.input;
final result = this.result;
for (int r = 0; r < rounds; r++) {
for (int i = 0; i < count; i++) {
result[i] = input[i];
}
}
}
}
class PointerDoubleCopyViaSetRangeBenchmark extends PointerDoubleCopyBenchmark {
PointerDoubleCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);
@override
void run(int rounds) {
for (int r = 0; r < rounds; r++) {
result
.asTypedList(PointerDoubleCopyBenchmark.maxSizeInElements)
.setRange(0, count, input.asTypedList(count));
}
}
}
final defaultLengthsInBytes = [8, 64, 512, 4 * 1024, 1024 * 1024];
void main(List<String> args) {
final results = argParser.parse(args);
List<int> lengthsInBytes = defaultLengthsInBytes;
final emitter = Emitter(results);
if (results['length'].isNotEmpty) {
lengthsInBytes = (results['length'] as List<String>)
.map(int.parse)
.where((i) => i <= maxSizeInBytes)
.toList();
}
final filter = results.rest.firstOrNull;
final benchmarks = [
for (int bytes in lengthsInBytes) ...[
PointerUint8CopyViaMemmoveBenchmark(bytes),
PointerUint8CopyViaLoopBenchmark(bytes),
PointerDoubleCopyViaLoopBenchmark(bytes),
Uint8ListCopyViaLoopBenchmark(bytes),
Float64ListCopyViaLoopBenchmark(bytes),
PointerUint8CopyViaSetRangeBenchmark(bytes),
PointerDoubleCopyViaSetRangeBenchmark(bytes),
Uint8ListCopyViaSetRangeBenchmark(bytes),
Float64ListCopyViaSetRangeBenchmark(bytes),
],
];
for (var bench in benchmarks) {
if (filter == null || bench.name.contains(filter)) {
bench.report(emitter);
}
}
}