[CFE] Duplicate code finder experiment take #1

Experiment with tool that finds duplicate code.
This still leaves a lot to be desired, but has already found something,
e.g. https://dart-review.googlesource.com/c/sdk/+/257400

It will probably need some iterations before running it on more than
one file would make sense.

Change-Id: I687e63e0a9003a8350df6682db279050d407605a
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/257401
Reviewed-by: Johnni Winther <johnniwinther@google.com>
Commit-Queue: Jens Johansen <jensj@google.com>
This commit is contained in:
Jens Johansen 2022-09-16 08:56:50 +00:00 committed by Commit Bot
parent 3640b60c8c
commit ed4caaf6ec
3 changed files with 439 additions and 0 deletions

View file

@ -80,6 +80,7 @@ blorp
bold
boo
bots
boundaries
boundarykey
bowtie
boz
@ -285,6 +286,7 @@ forbidden
forces
foreground
forrest
forth
forty
fortytwo
foundation
@ -440,6 +442,7 @@ negatable
newworld
ninja
noisy
nomatch
nominal
nondefault
nonexisting
@ -627,6 +630,7 @@ subfolder
subtool
subtools
successes
suggest
summarization
summarized
sup
@ -640,6 +644,7 @@ ta
tails
templates
theoretically
thereby
thereof
thread
ticks

View file

@ -0,0 +1,363 @@
// Copyright (c) 2022, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
import 'dart:io';
import 'package:_fe_analyzer_shared/src/scanner/scanner.dart'
show ErrorToken, ScannerConfiguration, StringScanner;
import 'package:_fe_analyzer_shared/src/scanner/token.dart'
show BeginToken, SimpleToken, Token, TokenType;
class Duplicate {
final List<FromToUri> where;
final String example;
Duplicate(this.where, this.example);
@override
String toString() => "Duplicate[$example]";
}
class FromToUri {
final Uri uri;
final int startOffset;
final int endOffset;
FromToUri(this.uri, this.startOffset, this.endOffset);
}
class Line {
String content;
Uri uri;
int startOffset;
int endOffset;
Line? previous;
Line? next;
Line(
this.content, this.uri, this.startOffset, this.endOffset, this.previous) {
if (previous != null) {
previous!.next = this;
}
}
}
class ExtendedLines {
List<Line> startLines;
int lineCount;
ExtendedLines(this.startLines, this.lineCount);
}
class MultiMap<K, V> {
Map<K, List<V>> data = {};
void operator []=(K key, V value) {
List<V>? lookup = data[key];
if (lookup != null) {
lookup.add(value);
} else {
data[key] = [value];
}
}
}
void _indexLines(MultiMap<String, Line> mapped, Set<String> denyListed,
String data, Uri uri) {
// TODO(jensj): Work directly on scanned tokens, or use parser as well?
// * Should probably only operate on body content, and then not cross bracket
// boundaries in that it should suggest that "foo; } } bar;" is a duplicate
// -- while it might be duplicate, we can't replace that with a function
// call (to get rid of duplicate code).
// * If something is a string we could perhaps ignore the content so that
/*
assert(declaration.parent == _libraryTypeParameterScopeBuilder);
Map<String, Builder> members = declaration.members!;
Map<String, MemberBuilder> constructors = declaration.constructors!;
Map<String, MemberBuilder> setters = declaration.setters!;
Scope classScope = new Scope(
local: members,
setters: setters,
parent: scope.withTypeVariables(typeVariables),
debugName: "class $className",
isModifiable: false);
*/
// and
/*
assert(declaration.parent == _libraryTypeParameterScopeBuilder);
Map<String, Builder> members = declaration.members!;
Map<String, MemberBuilder> constructors = declaration.constructors!;
Map<String, MemberBuilder> setters = declaration.setters!;
Scope classScope = new Scope(
local: members,
setters: setters,
parent: scope.withTypeVariables(typeVariables),
debugName: "extension $extensionName",
isModifiable: false);
*/
// could match.
Token scannedToken = _scan(data);
if (scannedToken is ErrorToken) throw "Can't operate on erroneous data";
Token token = scannedToken;
StringBuffer sb = new StringBuffer();
String space = "";
int? startOffset;
List<Token> endGroups = [];
Line? previousLine;
void endLine(Token lastToken) {
String s = sb.toString();
int lineStart = startOffset!;
sb.clear();
space = "";
startOffset = null;
Line line = new Line(s, uri, lineStart, lastToken.charEnd, previousLine);
previousLine = line;
if (!denyListed.contains(s)) mapped[s] = line;
}
while (true) {
sb.write(space);
sb.write(token.lexeme);
space = " ";
startOffset ??= token.charOffset;
if (endGroups.isNotEmpty && endGroups.last == token) {
endLine(token);
endGroups.removeLast();
} else if (token is BeginToken &&
token.type == TokenType.OPEN_CURLY_BRACKET &&
token.endGroup != null) {
// End line on a "{".
endLine(token);
endGroups.add(token.endGroup!);
} else if (token is SimpleToken && token.type == TokenType.SEMICOLON) {
// End line on a ";".
endLine(token);
} else if (token.next!.isEof) {
endLine(token);
break;
}
token = token.next!;
}
}
void _extendIndexedLines(List<Line> lines, Set<Line> alreadyIncluded,
List<Duplicate> foundDuplicates, Set<String> denyListed) {
int length = lines.length;
if (length == 1) {
// The indexed line was only seen once. That's not a duplicate!
return;
}
// We move forward in the data, and if having found a -> b -> c
// after having processed 'a' we shouldn't process from 'b' too. We thus
// remove already included lines here.
for (Line line in lines) {
if (alreadyIncluded.contains(line)) {
length--;
}
}
if (length <= 1) {
// We've seen this line before, but all duplicates was included in another
// actual duplicate find. Don't process and report again!
return;
}
// Can this potential duplicate be extended?
List<ExtendedLines>? extended = _extend(lines,
existingMatchCount: 1, leastMatches: 3, denyListed: denyListed);
if (extended == null || extended.isEmpty) return;
for (ExtendedLines extendedLine in extended) {
for (Line line in extendedLine.startLines) {
int left = extendedLine.lineCount - 1;
Line l = line;
while (left > 0) {
alreadyIncluded.add(l);
left--;
l = l.next!;
}
}
List<FromToUri> where = [];
StringBuffer sb = new StringBuffer();
String? example;
for (Line firstLine in extendedLine.startLines) {
int left = extendedLine.lineCount - 1;
Line lastLine = firstLine;
if (example == null) sb.writeln(lastLine.content);
while (left > 0) {
left--;
lastLine = lastLine.next!;
if (example == null) sb.writeln(lastLine.content);
}
where.add(new FromToUri(
firstLine.uri, firstLine.startOffset, lastLine.endOffset));
example ??= sb.toString();
}
foundDuplicates.add(new Duplicate(where, example!));
}
}
List<Duplicate> findDuplicates(Map<Uri, String> data, {bool verbose = false}) {
MultiMap<String, Line> indexedLines = new MultiMap();
const Set<String> denyListed = const {"}", "return ;", ";", ") ;", "else {"};
for (MapEntry<Uri, String> entry in data.entries) {
_indexLines(indexedLines, denyListed, entry.value, entry.key);
}
// TODO(jensj): The already included approach is too simple. E.g.
/*
match1;
nomatch;
match0;
match1;
match2;
match3;
vs
match1;
nomatchX;
match0;
match1;
match2;
match3;
would first find match1 match2 match3 --- then match0 match1 match2 match3.
*/
Set<Line> alreadyIncluded = {};
List<Duplicate> result = [];
for (MapEntry<String, List<Line>> entry in indexedLines.data.entries) {
_extendIndexedLines(entry.value, alreadyIncluded, result, denyListed);
}
if (verbose) {
if (result.length == 0) {
print("Didn't find any duplicates.");
} else if (result.length == 1) {
print("Found 1 duplicate:");
} else {
print("Found ${result.length} duplicates:");
}
for (Duplicate duplicate in result) {
print("Found '${duplicate.example}' at:");
for (FromToUri where in duplicate.where) {
print("${where.uri}: ${where.startOffset} -> ${where.endOffset}");
}
print("----\n\n");
}
}
return result;
}
/// Given a list of lines that match, find duplicates that match on more lines,
/// thereby extending and possibly splitting the match.
List<ExtendedLines>? _extend(List<Line> lines,
{required int existingMatchCount,
required int leastMatches,
required Set<String> denyListed}) {
MultiMap<String, Line> mapped = new MultiMap();
// E.g. for this input
// a -> b1 -> c1 -> d1
// a -> b1 -> c1 -> d2
// a -> b2 -> c2 -> d1
// a -> b2 -> c2 -> d2
// a -> b3 -> c2 -> d1
// we'd like it to be split into
// [a, b1, c1] and [a, b2, c2]
for (Line line in lines) {
Line? next = line.next;
if (next != null) {
mapped[next.content] = next;
}
}
List<ExtendedLines>? result;
for (MapEntry<String, List<Line>> entry in mapped.data.entries) {
if (entry.value.length == 1) {
continue;
}
int newMatchCount = existingMatchCount + 1;
// Don't count e.g. '}' as an actual match -> require one additional match.
// Notice that we can't just not count it as that would destroy the count
// which we use to go back and forth between first and last matched line.
if (denyListed.contains(entry.key)) {
leastMatches++;
}
List<ExtendedLines>? extended = _extend(entry.value,
existingMatchCount: newMatchCount,
leastMatches: leastMatches,
denyListed: denyListed);
if (extended != null) {
// Was extended further.
(result ??= []).addAll(extended);
} else if (newMatchCount >= leastMatches) {
// Couldn't be extended further, but this was far enough.
(result ??= []).add(new ExtendedLines(
entry.value.map((Line endLine) {
Line line = endLine;
int back = existingMatchCount;
while (back > 0) {
line = line.previous!;
back--;
}
return line;
}).toList(),
newMatchCount));
}
}
return result;
}
Token _scan(String data) {
ScannerConfiguration scannerConfiguration = new ScannerConfiguration(
enableTripleShift: true,
enableExtensionMethods: true,
enableNonNullable: true,
forAugmentationLibrary: false);
StringScanner scanner =
new StringScanner(data, configuration: scannerConfiguration);
Token firstToken = scanner.tokenize();
return firstToken;
}
void main(List<String> args) {
if (args.isEmpty) {
args = [
Platform.script
.resolve("../lib/src/fasta/source/source_library_builder.dart")
.toFilePath()
];
}
bool printed = false;
for (String s in args) {
File f = new File(s);
if (!f.existsSync()) continue;
String data = f.readAsStringSync();
if (printed) print("\n\n=============\n\n");
print("Output on $s:");
findDuplicates({Uri.parse(s): data}, verbose: true);
printed = true;
}
}

View file

@ -0,0 +1,71 @@
// Copyright (c) 2022, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
import 'duplicate_code_finder_experiment.dart';
void main() {
exactlySameLine();
sameLineDifferentIndentation();
}
Uri testUri = Uri.parse("test://uri.dart");
void exactlySameLine() {
// Can find exactly the same lines.
List<Duplicate> result = findDuplicates({
testUri: """
if (whatever()) {
Foo f = new Foo();
f.partA();
f.partB();
f.partC();
} else {
print("Else case");
Foo f = new Foo();
f.partA();
f.partB();
f.partC();
}
""",
});
if (result.length != 1) throw "Didn't find exactly 1 result; got $result";
String s = result.single.toString();
if (!s.contains("Foo f = new Foo ( ) ;") ||
!s.contains("f . partA ( ) ;") ||
!s.contains("f . partB ( ) ;") ||
!s.contains("f . partC ( ) ;")) {
throw "Didn't contain expected, was $s";
}
}
void sameLineDifferentIndentation() {
// Can find exactly the same lines.
List<Duplicate> result = findDuplicates({
testUri: """
if (whatever()) {
Foo f = new Foo();
f.partA();
f.partB();
f.partC();
} else {
if (something()) {
print("Else case with something");
} else {
Foo f = new Foo();
f.partA();
f.partB();
f.partC();
}
}
""",
});
if (result.length != 1) throw "Didn't find exactly 1 result; got $result";
String s = result.single.toString();
if (!s.contains("Foo f = new Foo ( ) ;") ||
!s.contains("f . partA ( ) ;") ||
!s.contains("f . partB ( ) ;") ||
!s.contains("f . partC ( ) ;")) {
throw "Didn't contain expected, was $s";
}
}