// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

// Micro-benchmarks for copying typed data lists.

import 'dart:ffi';
import 'dart:math';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:ffi/ffi.dart';

/// Size in bytes of every destination buffer, and therefore the largest copy
/// length that [main] accepts.
const maxSizeInBytes = 10 * 1024 * 1024;

/// Command line options understood by [main].
final argParser = ArgParser()
  ..addMultiOption('length',
      abbr: 'l',
      help: 'Byte length to benchmark',
      valueHelp: 'INT',
      defaultsTo: const [])
  ..addFlag('mebibytes-per-second',
      abbr: 'm', help: 'Show MiB/s', defaultsTo: false)
  ..addFlag('nanoseconds-per-byte',
      abbr: 'n', help: 'Show ns/byte', defaultsTo: false)
  ..addFlag('bytes-per-second',
      abbr: 'b', help: 'Show byte/s', defaultsTo: true)
  ..addFlag('verbose', abbr: 'v', help: 'Verbose output', defaultsTo: false)
  ..addFlag('aligned',
      abbr: 'a', help: 'Align results on initial numbers', defaultsTo: false);

/// Prints labeled benchmark results in the units selected on the command line.
class Emitter {
  /// Whether results are printed in bytes per second.
  final bool bytesPerSecond;

  /// Whether results are printed in nanoseconds per byte.
  final bool nanosecondsPerByte;

  /// Whether results are printed in mebibytes per second.
  final bool mebibytesPerSecond;

  /// Whether values are padded so that their integer parts line up.
  final bool _alignedOutput;

  /// Reads the output flags from [results]; `verbose` enables every unit.
  Emitter(ArgResults results)
      : bytesPerSecond = results['bytes-per-second'] || results['verbose'],
        nanosecondsPerByte =
            results['nanoseconds-per-byte'] || results['verbose'],
        mebibytesPerSecond =
            results['mebibytes-per-second'] || results['verbose'],
        _alignedOutput = results['aligned'];

  /// Matches the run of digits at the start of a printed value.
  static final kValueRegexp = RegExp(r'^([0-9]+)');

  /// Length of the longest label expected, used as the alignment column.
  static final kMaxLabelLength =
      'MemoryCopy.1048576.setRange.TypedData.Double(NanosecondsPerChar)'.length;

  // Maximum expected number of digits on either side of the decimal point.
  static final kMaxDigits = 16;

  /// Prints `label: value`, padding before the value when aligned output was
  /// requested.
  void printLabeledValue(String label, double value) {
    final valueString = value.toString();
    final buffer = StringBuffer()
      ..write(label)
      ..write(': ');
    if (_alignedOutput) {
      // A value whose text does not start with a digit (NaN, Infinity or a
      // negative number) is treated as having no leading digits instead of
      // failing on a null match.
      final leadingDigits =
          kValueRegexp.firstMatch(valueString)?[1]?.length ?? 0;
      final valuePadding = (kMaxLabelLength - label.length) +
          max(kMaxDigits - leadingDigits, 0);
      // String multiplication by a non-positive count yields an empty string,
      // so an over-long label simply gets no padding.
      buffer.write(' ' * valuePadding);
    }
    buffer.write(valueString);
    print(buffer.toString());
  }
}

// A modified version of BenchmarkBase from package:benchmark_harness where
// - the run() method takes a number of rounds, so that there is only one run()
//   call per measurement and thus the overhead of calling the run() method is
//   the same across subclass results.
// - the measureFor() method returns the number of bytes transferred per
//   second, not the number of microseconds per iteration (round).
abstract class MemoryCopyBenchmark {
  /// Reported benchmark name, always prefixed with `MemoryCopy.`.
  final String name;

  /// Number of bytes counted as copied per round.
  final int bytes;

  MemoryCopyBenchmark(String name, this.bytes) : name = 'MemoryCopy.$name';

  /// Minimum number of bytes to copy in the first `run()` invocation.
  static const targetBatchSizeInBytes = 32 * 1024;

  /// Returns the number of bytes copied per second.
  ///
  /// Calls [run] with a growing number of rounds until a single call lasts at
  /// least [minDuration] (minus the permitted jitter).
  double measureFor(Duration minDuration) {
    // The logic below is based off of BenchmarkBase._measureForImpl.

    // We can't use BenchmarkBase.measureFor directly, because
    // * it calls the function in a loop instead of passing the number of
    //   desired iterations to the function being called. Here, method
    //   invocation would dominate the actual body for small byte counts.
    // * it doesn't provide the caller with the number of iterations performed,
    //   which we need to calculate the number of bytes transferred.

    // Start off with enough rounds to ensure a minimum number of bytes copied
    // per run() invocation.
    int rounds = max(targetBatchSizeInBytes ~/ bytes, 1);

    // If running a long measurement permit some amount of measurement jitter
    // to avoid discarding results that almost, but not quite, reach the minimum
    // duration requested.
    final allowedJitter = Duration(
        microseconds: minDuration.inSeconds > 0
            ? (minDuration.inMicroseconds * 0.1).floor()
            : 0);

    final watch = Stopwatch()..start();
    while (true) {
      // Try running for the current number of rounds and see if that reaches
      // the minimum duration requested, so we only get the elapsed time from
      // the Stopwatch once for the final results used.
      watch.reset();
      run(rounds);
      final elapsed = watch.elapsed;
      final numberOfBytesCopied = rounds * bytes;
      if (elapsed >= (minDuration - allowedJitter)) {
        return (numberOfBytesCopied / elapsed.inMicroseconds) *
            Duration.microsecondsPerSecond;
      }
      // If not, then adjust our estimate of how many iterations are needed to
      // reach the minimum and try again.
      if (elapsed.inMilliseconds == 0) {
        rounds *= 1000;
      } else {
        rounds *= (minDuration.inMicroseconds / elapsed.inMicroseconds).ceil();
      }
    }
  }

  /// Runs [setup], a warmup, the timed measurement and [teardown], and returns
  /// the measured bytes per second.
  double measure() {
    setup();
    // Warmup for 100 ms.
    measureFor(const Duration(milliseconds: 100));
    // Run benchmark for 1 second.
    final double result = measureFor(const Duration(seconds: 1));
    teardown();
    return result;
  }

  /// Measures the benchmark and prints it in every unit [emitter] enables.
  void report(Emitter emitter) {
    final bytesPerSecond = measure();
    if (emitter.bytesPerSecond) {
      emitter.printLabeledValue('$name(BytesPerSecond)', bytesPerSecond);
    }
    if (emitter.nanosecondsPerByte) {
      const nanoSecondsPerSecond = 1000 * 1000 * 1000;
      final nanosecondsPerByte = nanoSecondsPerSecond / bytesPerSecond;
      // The label text is part of the emitted result name; it is left as is.
      emitter.printLabeledValue(
          '$name(NanosecondsPerChar)', nanosecondsPerByte);
    }
    if (emitter.mebibytesPerSecond) {
      const bytesPerMebibyte = 1024 * 1024;
      final mibPerSecond = bytesPerSecond / bytesPerMebibyte;
      emitter.printLabeledValue('$name(MebibytesPerSecond)', mibPerSecond);
    }
  }

  /// Allocates and initializes the buffers used by [run].
  void setup();

  /// Verifies the copied data and releases the buffers.
  void teardown();

  /// Performs the copy [rounds] times.
  void run(int rounds);
}

/// Base for benchmarks copying [count] bytes between two [Uint8List]s.
abstract class Uint8ListCopyBenchmark extends MemoryCopyBenchmark {
  /// Number of elements copied per round.
  final int count;
  late Uint8List input;
  late Uint8List result;

  Uint8ListCopyBenchmark(String method, int bytes)
      : count = bytes,
        super('$bytes.$method.TypedData.Uint8', bytes);

  @override
  void setup() {
    input = Uint8List(count);
    for (int i = 0; i < count; ++i) {
      input[i] = (i + 3) & 0xff;
    }
    result = Uint8List(maxSizeInBytes);
  }

  /// Checks that the first [count] elements were copied and that the rest of
  /// the destination is still zero.
  @override
  void teardown() {
    for (int i = 0; i < count; ++i) {
      final expected = (i + 3) & 0xff;
      if (result[i] != expected) {
        throw 'Expected result[$i] = $expected, got ${result[i]}';
      }
    }
    final expected = 0;
    for (int i = count; i < maxSizeInBytes; ++i) {
      if (result[i] != expected) {
        throw 'Expected result[$i] = $expected, got ${result[i]}';
      }
    }
  }
}

/// Copies a [Uint8List] one element at a time.
class Uint8ListCopyViaLoopBenchmark extends Uint8ListCopyBenchmark {
  Uint8ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes);

  @override
  void run(int rounds) {
    // Read the fields into locals once, outside the copy loops.
    final count = this.count;
    final input = this.input;
    final result = this.result;
    for (int r = 0; r < rounds; r++) {
      for (int i = 0; i < count; i++) {
        result[i] = input[i];
      }
    }
  }
}

/// Copies a [Uint8List] with `setRange`.
class Uint8ListCopyViaSetRangeBenchmark extends Uint8ListCopyBenchmark {
  Uint8ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);

  @override
  void run(int rounds) {
    for (int r = 0; r < rounds; r++) {
      result.setRange(0, count, input);
    }
  }
}

/// Base for benchmarks copying between two [Float64List]s.
///
/// [count] is `bytes ~/ 8`, so a byte length that is not a multiple of 8 is
/// rounded down to whole elements.
abstract class Float64ListCopyBenchmark extends MemoryCopyBenchmark {
  /// Number of elements copied per round.
  final int count;
  late Float64List input;
  late Float64List result;

  Float64ListCopyBenchmark(String method, int bytes)
      : count = bytes ~/ 8,
        super('$bytes.$method.TypedData.Double', bytes);

  /// Length of the destination list, in elements.
  static const maxSizeInElements = maxSizeInBytes ~/ 8;

  @override
  void setup() {
    input = Float64List(count);
    for (int i = 0; i < count; ++i) {
      input[i] = (i - 7).toDouble();
    }
    result = Float64List(maxSizeInElements);
  }

  /// Checks that the first [count] elements were copied and that the rest of
  /// the destination is still zero.
  @override
  void teardown() {
    for (int i = 0; i < count; ++i) {
      final expected = (i - 7).toDouble();
      if (result[i] != expected) {
        throw 'Expected result[$i] = $expected, got ${result[i]}';
      }
    }
    final expected = 0.0;
    for (int i = count; i < maxSizeInElements; ++i) {
      if (result[i] != expected) {
        throw 'Expected result[$i] = $expected, got ${result[i]}';
      }
    }
  }
}

/// Copies a [Float64List] one element at a time.
class Float64ListCopyViaLoopBenchmark extends Float64ListCopyBenchmark {
  Float64ListCopyViaLoopBenchmark(int bytes) : super('loop', bytes);

  @override
  void run(int rounds) {
    // Read the fields into locals once, outside the copy loops.
    final count = this.count;
    final input = this.input;
    final result = this.result;
    for (int r = 0; r < rounds; r++) {
      for (int i = 0; i < count; i++) {
        result[i] = input[i];
      }
    }
  }
}

/// Copies a [Float64List] with `setRange`.
class Float64ListCopyViaSetRangeBenchmark extends Float64ListCopyBenchmark {
  Float64ListCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);

  @override
  void run(int rounds) {
    for (int r = 0; r < rounds; r++) {
      result.setRange(0, count, input);
    }
  }
}

/// Base for benchmarks copying [count] bytes between two natively allocated
/// `Pointer<Uint8>` buffers.
abstract class PointerUint8CopyBenchmark extends MemoryCopyBenchmark {
  /// Number of elements copied per round.
  final int count;
  late Pointer<Uint8> input;
  late Pointer<Uint8> result;

  PointerUint8CopyBenchmark(String method, int bytes)
      : count = bytes,
        super('$bytes.$method.Pointer.Uint8', bytes);

  @override
  void setup() {
    input = malloc<Uint8>(count);
    for (var i = 0; i < count; ++i) {
      input[i] = (i + 3) & 0xff;
    }
    // calloc is used for the destination; teardown() expects the elements
    // past `count` to read as zero.
    result = calloc<Uint8>(maxSizeInBytes);
  }

  /// Checks that the first [count] elements were copied and that the rest of
  /// the destination is still zero, then frees both buffers.
  @override
  void teardown() {
    try {
      for (var i = 0; i < count; ++i) {
        final expected = (i + 3) & 0xff;
        if (result[i] != expected) {
          throw 'Expected result[$i] = $expected, got ${result[i]}';
        }
      }
      final expected = 0;
      for (var i = count; i < maxSizeInBytes; ++i) {
        if (result[i] != expected) {
          throw 'Expected result[$i] = $expected, got ${result[i]}';
        }
      }
    } finally {
      // Release both native buffers even when verification throws.
      malloc.free(input);
      calloc.free(result);
    }
  }
}

/// Copies between `Pointer<Uint8>` buffers one element at a time.
class PointerUint8CopyViaLoopBenchmark extends PointerUint8CopyBenchmark {
  PointerUint8CopyViaLoopBenchmark(int bytes) : super('loop', bytes);

  @override
  void run(int rounds) {
    // Compare the setRange version to looping using Pointer.[]/Pointer.[]=.
    final count = this.count;
    final input = this.input;
    final result = this.result;
    for (int r = 0; r < rounds; r++) {
      for (int i = 0; i < count; i++) {
        result[i] = input[i];
      }
    }
  }
}

/// Copies between `Pointer<Uint8>` buffers by viewing both as typed lists and
/// calling `setRange`; the views are created anew in every round.
class PointerUint8CopyViaSetRangeBenchmark extends PointerUint8CopyBenchmark {
  PointerUint8CopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);

  @override
  void run(int rounds) {
    for (int r = 0; r < rounds; r++) {
      result
          .asTypedList(maxSizeInBytes)
          .setRange(0, count, input.asTypedList(count));
    }
  }
}

/// Native binding for `memmove`, declared as a leaf call.
@Native<Void Function(Pointer<Void>, Pointer<Void>, Size)>(isLeaf: true)
external void memmove(Pointer<Void> to, Pointer<Void> from, int size);

/// Copies between `Pointer<Uint8>` buffers with the native [memmove].
class PointerUint8CopyViaMemmoveBenchmark extends PointerUint8CopyBenchmark {
  // This particular benchmark was originally written using memcpy, but a
  // better comparison is against memmove. While our benchmarks don't use
  // to and from memory that overlaps, in general this case must be handled.
  //
  // In order to not have to change the benchmark suite in golem, we keep the
  // old name for this result.
  PointerUint8CopyViaMemmoveBenchmark(int bytes) : super('memcpy', bytes);

  @override
  void run(int rounds) {
    for (int r = 0; r < rounds; r++) {
      memmove(result.cast(), input.cast(), count);
    }
  }
}

/// Base for benchmarks copying between two natively allocated
/// `Pointer<Double>` buffers.
///
/// [count] is `bytes ~/ 8`, so a byte length that is not a multiple of 8 is
/// rounded down to whole elements.
abstract class PointerDoubleCopyBenchmark extends MemoryCopyBenchmark {
  /// Number of elements copied per round.
  final int count;
  late Pointer<Double> input;
  late Pointer<Double> result;

  PointerDoubleCopyBenchmark(String method, int bytes)
      : count = bytes ~/ 8,
        super('$bytes.$method.Pointer.Double', bytes);

  /// Length of the destination buffer, in elements.
  static const maxSizeInElements = maxSizeInBytes ~/ 8;

  @override
  void setup() {
    input = malloc<Double>(count);
    for (var i = 0; i < count; ++i) {
      input[i] = (i - 7).toDouble();
    }
    // calloc is used for the destination; teardown() expects the elements
    // past `count` to read as zero.
    result = calloc<Double>(maxSizeInElements);
  }

  /// Checks that the first [count] elements were copied and that the rest of
  /// the destination is still zero, then frees both buffers.
  @override
  void teardown() {
    try {
      for (var i = 0; i < count; ++i) {
        final expected = (i - 7).toDouble();
        if (result[i] != expected) {
          throw 'Expected result[$i] = $expected, got ${result[i]}';
        }
      }
      final expected = 0.0;
      for (var i = count; i < maxSizeInElements; ++i) {
        if (result[i] != expected) {
          throw 'Expected result[$i] = $expected, got ${result[i]}';
        }
      }
    } finally {
      // Release both native buffers even when verification throws.
      malloc.free(input);
      calloc.free(result);
    }
  }
}

/// Copies between `Pointer<Double>` buffers one element at a time.
class PointerDoubleCopyViaLoopBenchmark extends PointerDoubleCopyBenchmark {
  PointerDoubleCopyViaLoopBenchmark(int bytes) : super('loop', bytes);

  @override
  void run(int rounds) {
    // Compare the setRange version to looping using Pointer.[]/Pointer.[]=.
    final count = this.count;
    final input = this.input;
    final result = this.result;
    for (int r = 0; r < rounds; r++) {
      for (int i = 0; i < count; i++) {
        result[i] = input[i];
      }
    }
  }
}

/// Copies between `Pointer<Double>` buffers by viewing both as typed lists and
/// calling `setRange`; the views are created anew in every round.
class PointerDoubleCopyViaSetRangeBenchmark
    extends PointerDoubleCopyBenchmark {
  PointerDoubleCopyViaSetRangeBenchmark(int bytes) : super('setRange', bytes);

  @override
  void run(int rounds) {
    for (int r = 0; r < rounds; r++) {
      result
          .asTypedList(PointerDoubleCopyBenchmark.maxSizeInElements)
          .setRange(0, count, input.asTypedList(count));
    }
  }
}

/// Byte lengths benchmarked when no `--length` option is given.
final defaultLengthsInBytes = [8, 64, 512, 4 * 1024, 1024 * 1024];

/// Runs every benchmark variant for each requested byte length.
///
/// The first positional argument, if any, is a substring filter on the
/// benchmark name.
void main(List<String> args) {
  final results = argParser.parse(args);
  List<int> lengthsInBytes = defaultLengthsInBytes;
  final emitter = Emitter(results);
  if (results['length'].isNotEmpty) {
    // Lengths larger than the destination buffers are dropped. Non-positive
    // lengths are dropped as well: 0 would divide by zero when computing the
    // initial number of rounds, and a negative length cannot be allocated.
    lengthsInBytes = (results['length'] as List<String>)
        .map(int.parse)
        .where((i) => i > 0 && i <= maxSizeInBytes)
        .toList();
  }
  final filter = results.rest.firstOrNull;
  final benchmarks = <MemoryCopyBenchmark>[
    for (int bytes in lengthsInBytes) ...[
      PointerUint8CopyViaMemmoveBenchmark(bytes),
      PointerUint8CopyViaLoopBenchmark(bytes),
      PointerDoubleCopyViaLoopBenchmark(bytes),
      Uint8ListCopyViaLoopBenchmark(bytes),
      Float64ListCopyViaLoopBenchmark(bytes),
      PointerUint8CopyViaSetRangeBenchmark(bytes),
      PointerDoubleCopyViaSetRangeBenchmark(bytes),
      Uint8ListCopyViaSetRangeBenchmark(bytes),
      Float64ListCopyViaSetRangeBenchmark(bytes),
    ],
  ];
  for (var bench in benchmarks) {
    if (filter == null || bench.name.contains(filter)) {
      bench.report(emitter);
    }
  }
}