From 3519bec6c4c0ef0025a31fe103ef4d5f4951ab8a Mon Sep 17 00:00:00 2001 From: Jim Graham Date: Fri, 8 May 2020 00:34:02 -0700 Subject: [PATCH] Save results of A/B test runs in a JSON file for future processing (#56416) --- dev/devicelab/README.md | 26 ++++- dev/devicelab/bin/run.dart | 29 ++++- dev/devicelab/bin/summarize.dart | 84 ++++++++++++++ dev/devicelab/lib/framework/ab.dart | 167 +++++++++++++++++++++++++++- dev/devicelab/test/ab_test.dart | 3 +- 5 files changed, 300 insertions(+), 9 deletions(-) create mode 100644 dev/devicelab/bin/summarize.dart diff --git a/dev/devicelab/README.md b/dev/devicelab/README.md index 8cc00219073..4c6a062cc20 100644 --- a/dev/devicelab/README.md +++ b/dev/devicelab/README.md @@ -167,8 +167,10 @@ An example of a local engine architecture is `android_debug_unopt_x86`. You can run an A/B test that compares the performance of the default engine against a local engine build. The test runs the same benchmark a specified number of times against both engines, then outputs a tab-separated spreadsheet -with the results. The results can be copied to a Google Spreadsheet for further -inspection. +with the results and stores them in a JSON file for future reference. The +results can be copied to a Google Spreadsheet for further inspection and the +JSON file can be reprocessed with the summarize.dart command for more detailed +output. Example: @@ -183,6 +185,11 @@ The `--ab=10` tells the runner to run an A/B test 10 times. `--local-engine=host_debug_unopt` tells the A/B test to use the `host_debug_unopt` engine build. `--local-engine` is required for A/B test. +`--ab-result-file=filename` can be used to provide an alternate location to output +the JSON results file (defaults to `ABresults#.json`). A single `#` character can be +used to indicate where to insert a serial number if a file with that name already +exists, otherwise the file will be overwritten. + A/B can run exactly one task. Multiple tasks are not supported. Example output: @@ -203,6 +210,21 @@ the default engine. Values less than 1.0 indicate a slow-down. For example, 0.5x means the local engine is twice as slow as the default engine, and 2.0x means it's twice as fast. Higher is better. +Summarize tool example: + +```sh +../../bin/cache/dart-sdk/bin/dart bin/summarize.dart --[no-]tsv-table --[no-]raw-summary \ + ABresults.json ABresults1.json ABresults2.json ... +``` + +`--[no-]tsv-table` tells the tool to print the summary in a table with tabs for easy spreadsheet +entry. (defaults to on) + +`--[no-]raw-summary` tells the tool to print all per-run data collected by the A/B test formatted +with tabs for easy spreadsheet entry. (defaults to on) + +Multiple trailing filenames can be specified and each such results file will be processed in turn. + # Reproducing broken builds locally To reproduce the breakage locally `git checkout` the corresponding Flutter diff --git a/dev/devicelab/bin/run.dart b/dev/devicelab/bin/run.dart index 682bc3d3874..ac08b7a1030 100644 --- a/dev/devicelab/bin/run.dart +++ b/dev/devicelab/bin/run.dart @@ -125,7 +125,7 @@ Future _runABTest() async { print('$taskName A/B test. Will run $runsPerTest times.'); - final ABTest abTest = ABTest(); + final ABTest abTest = ABTest(localEngine, taskName); for (int i = 1; i <= runsPerTest; i++) { section('Run #$i'); @@ -168,6 +168,10 @@ Future _runABTest() async { print(abTest.printSummary()); } } + abTest.finalize(); + + final File jsonFile = _uniqueFile(args['ab-result-file'] as String ?? 'ABresults#.json'); + jsonFile.writeAsString(const JsonEncoder.withIndent(' ').convert(abTest.jsonMap)); if (!silent) { section('Raw results'); @@ -176,6 +180,23 @@ Future _runABTest() async { section('Final A/B results'); print(abTest.printSummary()); + + print(''); + print('Results saved to ${jsonFile.path}'); +} + +File _uniqueFile(String filenameTemplate) { + final List parts = filenameTemplate.split('#'); + if (parts.length != 2) { + return File(filenameTemplate); + } + File file = File(parts[0] + parts[1]); + int i = 1; + while (file.existsSync()) { + file = File(parts[0]+i.toString()+parts[1]); + i++; + } + return file; } void addTasks({ @@ -245,6 +266,12 @@ final ArgParser _argParser = ArgParser() } }, ) + ..addOption( + 'ab-result-file', + help: 'The filename in which to place the json encoded results of an A/B test.\n' + 'The filename may contain a single # character to be replaced by a sequence\n' + 'number if the name already exists.', + ) ..addFlag( 'all', abbr: 'a', diff --git a/dev/devicelab/bin/summarize.dart b/dev/devicelab/bin/summarize.dart new file mode 100644 index 00000000000..2a3b28aa4e8 --- /dev/null +++ b/dev/devicelab/bin/summarize.dart @@ -0,0 +1,84 @@ +// Copyright 2014 The Flutter Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import 'dart:convert'; +import 'dart:io'; + +import 'package:args/args.dart'; + +import 'package:flutter_devicelab/framework/ab.dart'; +import 'package:flutter_devicelab/framework/utils.dart'; + +String kRawSummaryOpt = 'raw-summary'; +String kTabTableOpt = 'tsv-table'; +String kAsciiTableOpt = 'ascii-table'; + +void _usage(String error) { + stderr.writeln(error); + stderr.writeln('Usage:\n'); + stderr.writeln(_argParser.usage); + exitCode = 1; +} + +Future main(List rawArgs) async { + ArgResults args; + try { + args = _argParser.parse(rawArgs); + } on FormatException catch (error) { + _usage('${error.message}\n'); + return; + } + + final List jsonFiles = args.rest.isNotEmpty ? args.rest : [ 'ABresults.json' ]; + + for (final String filename in jsonFiles) { + final File file = File(filename); + if (!file.existsSync()) { + _usage('File "$filename" does not exist'); + return; + } + + ABTest test; + try { + test = ABTest.fromJsonMap( + const JsonDecoder().convert(await file.readAsString()) as Map + ); + } catch(error) { + _usage('Could not parse json file "$filename"'); + return; + } + + if (args[kRawSummaryOpt] as bool) { + section('Raw results for "$filename"'); + print(test.rawResults()); + } + if (args[kTabTableOpt] as bool) { + section('A/B comparison for "$filename"'); + print(test.printSummary()); + } + if (args[kAsciiTableOpt] as bool) { + section('Formatted summary for "$filename"'); + print(test.asciiSummary()); + } + } +} + +/// Command-line options for the `summarize.dart` command. +final ArgParser _argParser = ArgParser() + ..addFlag( + kAsciiTableOpt, + defaultsTo: true, + help: 'Prints the summary in a table formatted nicely for terminal output.', + ) + ..addFlag( + kTabTableOpt, + defaultsTo: true, + help: 'Prints the summary in a table with tabs for easy spreadsheet entry.', + ) + ..addFlag( + kRawSummaryOpt, + defaultsTo: true, + help: 'Prints all per-run data collected by the A/B test formatted with\n' + 'tabs for easy spreadsheet entry.', + ); diff --git a/dev/devicelab/lib/framework/ab.dart b/dev/devicelab/lib/framework/ab.dart index 84ce835fa43..09b068e4540 100644 --- a/dev/devicelab/lib/framework/ab.dart +++ b/dev/devicelab/lib/framework/ab.dart @@ -5,12 +5,53 @@ import 'dart:math' as math; import 'package:meta/meta.dart'; +const String kBenchmarkTypeKeyName = 'benchmark_type'; +const String kBenchmarkVersionKeyName = 'version'; +const String kLocalEngineKeyName = 'local_engine'; +const String kTaskNameKeyName = 'task_name'; +const String kRunStartKeyName = 'run_start'; +const String kRunEndKeyName = 'run_end'; +const String kAResultsKeyName = 'default_results'; +const String kBResultsKeyName = 'local_engine_results'; + +const String kBenchmarkResultsType = 'A/B summaries'; +const String kBenchmarkABVersion = '1.0'; + +enum FieldJustification { LEFT, RIGHT, CENTER } + /// Collects data from an A/B test and produces a summary for human evaluation. /// /// See [printSummary] for more. class ABTest { - final Map> _aResults = >{}; - final Map> _bResults = >{}; + ABTest(this.localEngine, this.taskName) + : runStart = DateTime.now(), + _aResults = >{}, + _bResults = >{}; + + ABTest.fromJsonMap(Map jsonResults) + : localEngine = jsonResults[kLocalEngineKeyName] as String, + taskName = jsonResults[kTaskNameKeyName] as String, + runStart = DateTime.parse(jsonResults[kRunStartKeyName] as String), + _runEnd = DateTime.parse(jsonResults[kRunEndKeyName] as String), + _aResults = _convertFrom(jsonResults[kAResultsKeyName] as Map), + _bResults = _convertFrom(jsonResults[kBResultsKeyName] as Map); + + final String localEngine; + final String taskName; + final DateTime runStart; + DateTime _runEnd; + DateTime get runEnd => _runEnd; + + final Map> _aResults; + final Map> _bResults; + + static Map> _convertFrom(dynamic results) { + final Map resultMap = results as Map; + return > { + for (String key in resultMap.keys) + key: (resultMap[key] as List).cast() + }; + } /// Adds the result of a single A run of the benchmark. /// @@ -18,6 +59,9 @@ class ABTest { /// /// [result] is expected to be a serialization of [TaskResult]. void addAResult(Map result) { + if (_runEnd != null) { + throw StateError('Cannot add results to ABTest after it is finalized'); + } _addResult(result, _aResults); } @@ -27,9 +71,115 @@ class ABTest { /// /// [result] is expected to be a serialization of [TaskResult]. void addBResult(Map result) { + if (_runEnd != null) { + throw StateError('Cannot add results to ABTest after it is finalized'); + } _addResult(result, _bResults); } + void finalize() { + _runEnd = DateTime.now(); + } + + Map get jsonMap => { + kBenchmarkTypeKeyName: kBenchmarkResultsType, + kBenchmarkVersionKeyName: kBenchmarkABVersion, + kLocalEngineKeyName: localEngine, + kTaskNameKeyName: taskName, + kRunStartKeyName: runStart.toIso8601String(), + kRunEndKeyName: runEnd.toIso8601String(), + kAResultsKeyName: _aResults, + kBResultsKeyName: _bResults, + }; + + static void updateColumnLengths(List lengths, List results) { + for (int column = 0; column < lengths.length; column++) { + if (results[column] != null) { + lengths[column] = math.max(lengths[column], results[column].length); + } + } + } + + static void formatResult(StringBuffer buffer, + List lengths, + List aligns, + List values) { + for (int column = 0; column < lengths.length; column++) { + final int len = lengths[column]; + String value = values[column]; + if (value == null) { + value = ''.padRight(len); + } else { + switch (aligns[column]) { + case FieldJustification.LEFT: + value = value.padRight(len); + break; + case FieldJustification.RIGHT: + value = value.padLeft(len); + break; + case FieldJustification.CENTER: + value = value.padLeft((len + value.length) ~/2); + value = value.padRight(len); + break; + } + } + if (column > 0) { + value = value.padLeft(len+1); + } + buffer.write(value); + } + buffer.writeln(); + } + + /// Returns the summary as a tab-separated spreadsheet. + /// + /// This value can be copied straight to a Google Spreadsheet for further analysis. + String asciiSummary() { + final Map summariesA = _summarize(_aResults); + final Map summariesB = _summarize(_bResults); + + final List> tableRows = >[ + for (final String scoreKey in {...summariesA.keys, ...summariesB.keys}) + [ + scoreKey, + summariesA[scoreKey]?.averageString, summariesA[scoreKey]?.noiseString, + summariesB[scoreKey]?.averageString, summariesB[scoreKey]?.noiseString, + summariesA[scoreKey]?.improvementOver(summariesB[scoreKey]), + ], + ]; + + final List titles = [ + 'Score', + 'Average A', '(noise)', + 'Average B', '(noise)', + 'Speed-up' + ]; + final List alignments = [ + FieldJustification.LEFT, + FieldJustification.RIGHT, FieldJustification.LEFT, + FieldJustification.RIGHT, FieldJustification.LEFT, + FieldJustification.CENTER + ]; + + final List lengths = List.filled(6, 0); + updateColumnLengths(lengths, titles); + for (final List row in tableRows) { + updateColumnLengths(lengths, row); + } + + final StringBuffer buffer = StringBuffer(); + formatResult(buffer, lengths, + [ + FieldJustification.CENTER, + ...alignments.skip(1), + ], titles); + for (final List row in tableRows) { + formatResult(buffer, lengths, alignments, row); + } + + return buffer.toString(); + } + /// Returns unprocessed data collected by the A/B test formatted as /// a tab-separated spreadsheet. String rawResults() { @@ -83,19 +233,19 @@ class ABTest { buffer.write('$scoreKey\t'); if (summaryA != null) { - buffer.write('${summaryA.average.toStringAsFixed(2)} (${_ratioToPercent(summaryA.noise)})\t'); + buffer.write('${summaryA.averageString} ${summaryA.noiseString}\t'); } else { buffer.write('\t'); } if (summaryB != null) { - buffer.write('${summaryB.average.toStringAsFixed(2)} (${_ratioToPercent(summaryB.noise)})\t'); + buffer.write('${summaryB.averageString} ${summaryB.noiseString}\t'); } else { buffer.write('\t'); } if (summaryA != null && summaryB != null) { - buffer.write('${(summaryA.average / summaryB.average).toStringAsFixed(2)}x\t'); + buffer.write('${summaryA.improvementOver(summaryB)}\t'); } buffer.writeln(); @@ -117,6 +267,13 @@ class _ScoreSummary { /// The noise (standard deviation divided by [average]) in the collected /// values. final double noise; + + String get averageString => average.toStringAsFixed(2); + String get noiseString => '(${_ratioToPercent(noise)})'; + + String improvementOver(_ScoreSummary other) { + return other == null ? '' : '${(average / other.average).toStringAsFixed(2)}x'; + } } void _addResult(Map result, Map> results) { diff --git a/dev/devicelab/test/ab_test.dart b/dev/devicelab/test/ab_test.dart index d87222aa8c0..35ad195b2e2 100644 --- a/dev/devicelab/test/ab_test.dart +++ b/dev/devicelab/test/ab_test.dart @@ -8,7 +8,7 @@ import 'common.dart'; void main() { test('ABTest', () { - final ABTest ab = ABTest(); + final ABTest ab = ABTest('engine', 'test'); for (int i = 0; i < 5; i++) { ab.addAResult({ @@ -28,6 +28,7 @@ void main() { 'benchmarkScoreKeys': ['i', 'k'], }); } + ab.finalize(); expect( ab.rawResults(),