Add A/B test mode to local devicelab runner (#54494)

* Add A/B test mode to local devicelab runner
This commit is contained in:
Yegor 2020-04-10 16:53:52 -07:00 committed by GitHub
parent 03c566768b
commit d119e5f1e4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 285 additions and 6 deletions

View file

@ -162,6 +162,47 @@ flags to `bin/run.dart`:
An example of a local engine architecture is `android_debug_unopt_x86`.
## Running an A/B test for engine changes
You can run an A/B test that compares the performance of the default engine
against a local engine build. The test runs the same benchmark a specified
number of times against both engines, then outputs a tab-separated spreadsheet
with the results. The results can be copied to a Google Spreadsheet for further
inspection.
Example:
```sh
../../bin/cache/dart-sdk/bin/dart bin/run.dart --ab=10 \
--local-engine=host_debug_unopt \
-t bin/tasks/web_benchmarks_canvaskit.dart
```
The `--ab=10` tells the runner to run an A/B test 10 times.
`--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt`
engine build. `--local-engine` is required for an A/B test.
An A/B test can run exactly one task; running multiple tasks is not supported.
Example output:
```
Score Average A (noise) Average B (noise) Speed-up
bench_card_infinite_scroll.canvaskit.drawFrameDuration.average 2900.20 (8.44%) 2426.70 (8.94%) 1.20x
bench_card_infinite_scroll.canvaskit.totalUiFrame.average 4964.00 (6.29%) 4098.00 (8.03%) 1.21x
draw_rect.canvaskit.windowRenderDuration.average 1959.45 (16.56%) 2286.65 (0.61%) 0.86x
draw_rect.canvaskit.sceneBuildDuration.average 1969.45 (16.37%) 2294.90 (0.58%) 0.86x
draw_rect.canvaskit.drawFrameDuration.average 5335.20 (17.59%) 6437.60 (0.59%) 0.83x
draw_rect.canvaskit.totalUiFrame.average 6832.00 (13.16%) 7932.00 (0.34%) 0.86x
```
The output contains averages and noises for each score. More importantly, it
contains the speed-up value, i.e. how much _faster_ is the local engine than
the default engine. Values less than 1.0 indicate a slow-down. For example,
0.5x means the local engine is twice as slow as the default engine, and 2.0x
means it's twice as fast. Higher is better.
# Reproducing broken builds locally
To reproduce the breakage locally `git checkout` the corresponding Flutter

View file

@ -9,18 +9,34 @@ import 'dart:io';
import 'package:args/args.dart';
import 'package:path/path.dart' as path;
import 'package:flutter_devicelab/framework/ab.dart';
import 'package:flutter_devicelab/framework/manifest.dart';
import 'package:flutter_devicelab/framework/runner.dart';
import 'package:flutter_devicelab/framework/utils.dart';
ArgResults args;
List<String> _taskNames = <String>[];
/// Suppresses standard output, prints only standard error output.
bool silent;
/// The build of the local engine to use.
///
/// Required for A/B test mode.
String localEngine;
/// The path to the engine "src/" directory.
String localEngineSrcPath;
/// Whether to exit on first test failure.
bool exitOnFirstTestFailure;
/// Runs tasks.
///
/// The tasks are chosen depending on the command-line options
/// (see [_argParser]).
Future<void> main(List<String> rawArgs) async {
ArgResults args;
try {
args = _argParser.parse(rawArgs);
} on FormatException catch (error) {
@ -55,10 +71,19 @@ Future<void> main(List<String> rawArgs) async {
return;
}
final bool silent = args['silent'] as bool;
final String localEngine = args['local-engine'] as String;
final String localEngineSrcPath = args['local-engine-src-path'] as String;
silent = args['silent'] as bool;
localEngine = args['local-engine'] as String;
localEngineSrcPath = args['local-engine-src-path'] as String;
exitOnFirstTestFailure = args['exit'] as bool;
if (args.wasParsed('ab')) {
await _runABTest();
} else {
await _runTasks();
}
}
Future<void> _runTasks() async {
for (final String taskName in _taskNames) {
section('Running task "$taskName"');
final Map<String, dynamic> result = await runTask(
@ -74,13 +99,73 @@ Future<void> main(List<String> rawArgs) async {
if (!(result['success'] as bool)) {
exitCode = 1;
if (args['exit'] as bool) {
if (exitOnFirstTestFailure) {
return;
}
}
}
}
/// Runs the single task in [_taskNames] as an A/B test, comparing the default
/// engine (A) against [localEngine] (B), then prints a tab-separated summary.
///
/// Requires `--local-engine`. Exits the process with code 1 on invalid
/// arguments or if either run of the task fails.
Future<void> _runABTest() async {
  final int runsPerTest = int.parse(args['ab'] as String);

  // Check != 1 (rather than the previous > 1) so that an empty task list
  // fails here with a clear message instead of throwing an opaque StateError
  // at `_taskNames.single` below.
  if (_taskNames.length != 1) {
    stderr.writeln('When running in A/B test mode exactly one task must be passed but got ${_taskNames.join(', ')}.\n');
    stderr.writeln(_argParser.usage);
    exit(1);
  }

  if (!args.wasParsed('local-engine')) {
    stderr.writeln('When running in A/B test mode --local-engine is required.\n');
    stderr.writeln(_argParser.usage);
    exit(1);
  }

  final String taskName = _taskNames.single;

  print('$taskName A/B test. Will run $runsPerTest times.');

  final ABTest abTest = ABTest();
  for (int i = 1; i <= runsPerTest; i++) {
    section('Run #$i');

    print('Running with the default engine (A)');
    final Map<String, dynamic> defaultEngineResult = await runTask(
      taskName,
      silent: silent,
    );

    print('Default engine result:');
    print(const JsonEncoder.withIndent(' ').convert(defaultEngineResult));

    // A failed A run aborts the whole A/B test: partial data would skew the
    // averages in the summary.
    if (!(defaultEngineResult['success'] as bool)) {
      stderr.writeln('Task failed on the default engine.');
      exit(1);
    }

    abTest.addAResult(defaultEngineResult);

    print('Running with the local engine (B)');
    final Map<String, dynamic> localEngineResult = await runTask(
      taskName,
      silent: silent,
      localEngine: localEngine,
      localEngineSrcPath: localEngineSrcPath,
    );

    // Was 'Task localEngineResult:' — that leaked a variable name into the
    // log; made consistent with the 'Default engine result:' message above.
    print('Local engine result:');
    print(const JsonEncoder.withIndent(' ').convert(localEngineResult));

    if (!(localEngineResult['success'] as bool)) {
      stderr.writeln('Task failed on the local engine.');
      exit(1);
    }

    abTest.addBResult(localEngineResult);
  }

  print(abTest.printSummary());
}
void addTasks({
List<ManifestTask> tasks,
ArgResults args,
@ -132,6 +217,22 @@ final ArgParser _argParser = ArgParser()
}
},
)
..addOption(
'ab',
help: 'Runs an A/B test comparing the default engine with the local\n'
'engine build for one task. This option does not support running\n'
'multiple tasks. The value is the number of times to run the task.\n'
'The task is expected to be a benchmark that reports score keys.\n'
'The A/B test collects the metrics collected by the test and\n'
'produces a report containing averages, noise, and the speed-up\n'
'between the two engines. --local-engine is required when running\n'
'an A/B test.',
callback: (String value) {
if (value != null && int.tryParse(value) == null) {
throw ArgParserException('Option --ab must be a number, but was "$value".');
}
},
)
..addFlag(
'all',
abbr: 'a',
@ -152,7 +253,8 @@ final ArgParser _argParser = ArgParser()
help: 'Name of a build output within the engine out directory, if you\n'
'are building Flutter locally. Use this to select a specific\n'
'version of the engine if you have built multiple engine targets.\n'
'This path is relative to --local-engine-src-path/out.',
'This path is relative to --local-engine-src-path/out. This option\n'
'is required when running an A/B test (see the --ab option).',
)
..addFlag(
'list',

View file

@ -0,0 +1,136 @@
// Copyright 2014 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:math' as math;
import 'package:meta/meta.dart';
/// Collects data from an A/B test and produces a summary for human evaluation.
///
/// See [printSummary] for more.
class ABTest {
  final Map<String, List<double>> _aResults = <String, List<double>>{};
  final Map<String, List<double>> _bResults = <String, List<double>>{};

  /// Adds the result of a single A run of the benchmark.
  ///
  /// The result may contain multiple score keys.
  ///
  /// [result] is expected to be a serialization of [TaskResult].
  void addAResult(Map<String, dynamic> result) => _addResult(result, _aResults);

  /// Adds the result of a single B run of the benchmark.
  ///
  /// The result may contain multiple score keys.
  ///
  /// [result] is expected to be a serialization of [TaskResult].
  void addBResult(Map<String, dynamic> result) => _addResult(result, _bResults);

  /// Returns the summary as a tab-separated spreadsheet.
  ///
  /// This value can be copied straight to a Google Spreadsheet for further analysis.
  String printSummary() {
    final Map<String, _ScoreSummary> summariesA = _summarize(_aResults);
    final Map<String, _ScoreSummary> summariesB = _summarize(_bResults);

    // Renders one "average (noise)" spreadsheet cell, or an empty cell when
    // the engine produced no data for this score key.
    String cell(_ScoreSummary summary) {
      if (summary == null) {
        return '\t';
      }
      return '${summary.average.toStringAsFixed(2)} (${_ratioToPercent(summary.noise)})\t';
    }

    final StringBuffer buffer = StringBuffer(
      'Score\tAverage A (noise)\tAverage B (noise)\tSpeed-up\n',
    );

    // Union of score keys; set-literal spread preserves A-then-B insertion order.
    for (final String scoreKey in <String>{...summariesA.keys, ...summariesB.keys}) {
      final _ScoreSummary summaryA = summariesA[scoreKey];
      final _ScoreSummary summaryB = summariesB[scoreKey];

      buffer.write('$scoreKey\t');
      buffer.write(cell(summaryA));
      buffer.write(cell(summaryB));
      // The speed-up column is only meaningful when both engines reported
      // this score.
      if (summaryA != null && summaryB != null) {
        buffer.write('${(summaryA.average / summaryB.average).toStringAsFixed(2)}x\t');
      }
      buffer.writeln();
    }

    return buffer.toString();
  }
}
/// An immutable summary of a series of values collected for one score key.
class _ScoreSummary {
  // All fields are final and the class is value-like, so the constructor can
  // be const.
  const _ScoreSummary({
    @required this.average,
    @required this.noise,
  });

  /// Average (arithmetic mean) of a series of values collected by a benchmark.
  final double average;

  /// The noise (standard deviation divided by [average]) in the collected
  /// values.
  final double noise;
}
/// Extracts the benchmark scores from a single task [result] and appends each
/// score, keyed by its score key, to [results].
///
/// [result] is expected to be a serialization of [TaskResult]: a map with a
/// 'benchmarkScoreKeys' list and a 'data' map containing numeric scores.
void _addResult(Map<String, dynamic> result, Map<String, List<double>> results) {
  final Map<String, dynamic> data = result['data'] as Map<String, dynamic>;
  for (final dynamic rawKey in result['benchmarkScoreKeys'] as List<dynamic>) {
    final String scoreKey = rawKey as String;
    // Scores may arrive as int or double; normalize to double.
    results.putIfAbsent(scoreKey, () => <double>[]).add((data[scoreKey] as num).toDouble());
  }
}
/// Builds a per-score-key [_ScoreSummary] (average and noise) from the raw
/// [results] collected by repeated benchmark runs.
Map<String, _ScoreSummary> _summarize(Map<String, List<double>> results) {
  final Map<String, _ScoreSummary> summaries = <String, _ScoreSummary>{};
  for (final MapEntry<String, List<double>> entry in results.entries) {
    final double average = _computeAverage(entry.value);
    // If the average is zero, the benchmark got the perfect score with no noise.
    final double noise = average > 0
        ? _computeStandardDeviationForPopulation(entry.value) / average
        : 0.0;
    summaries[entry.key] = _ScoreSummary(average: average, noise: noise);
  }
  return summaries;
}
/// Computes the arithmetic mean (or average) of given [values].
///
/// Throws a [StateError] (from [Iterable.reduce]) if [values] is empty.
double _computeAverage(Iterable<double> values) =>
    values.reduce((double total, double element) => total + element) /
    values.length;
/// Computes population standard deviation.
///
/// Unlike sample standard deviation, which divides by N - 1, this divides by N.
///
/// Throws a [StateError] (from [Iterable.reduce]) if [population] is empty.
///
/// See also:
///
/// * https://en.wikipedia.org/wiki/Standard_deviation
double _computeStandardDeviationForPopulation(Iterable<double> population) {
  final double mean =
      population.reduce((double a, double b) => a + b) / population.length;
  double sumOfSquaredDeltas = 0.0;
  for (final double value in population) {
    final double delta = value - mean;
    sumOfSquaredDeltas += delta * delta;
  }
  return math.sqrt(sumOfSquaredDeltas / population.length);
}
/// Formats a ratio (e.g. 0.0844) as a percent string with two decimal places
/// (e.g. '8.44%').
String _ratioToPercent(double value) => '${(value * 100).toStringAsFixed(2)}%';