[CFE] Duplicate code finder experiment take #1

Experiment with tool that finds duplicate code. This still leaves a lot to be desired, but has already found something, e.g. https://dart-review.googlesource.com/c/sdk/+/257400 It will probably need some iterations before running it on more than one file would make sense. Change-Id: I687e63e0a9003a8350df6682db279050d407605a Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/257401 Reviewed-by: Johnni Winther <johnniwinther@google.com> Commit-Queue: Jens Johansen <jensj@google.com>
2024-09-15 23:49:47 +00:00 · 2022-09-16 08:56:50 +00:00 · 2022-09-16 08:56:50 +00:00 · ed4caaf6ec
parent 3640b60c8c
commit ed4caaf6ec
3 changed files with 439 additions and 0 deletions
--- a/pkg/front_end/test/spell_checking_list_tests.txt
+++ b/pkg/front_end/test/spell_checking_list_tests.txt
@ -80,6 +80,7 @@ blorp
 bold
 boo
 bots
+boundaries
 boundarykey
 bowtie
 boz
@ -285,6 +286,7 @@ forbidden
 forces
 foreground
 forrest
+forth
 forty
 fortytwo
 foundation
@ -440,6 +442,7 @@ negatable
 newworld
 ninja
 noisy
+nomatch
 nominal
 nondefault
 nonexisting
@ -627,6 +630,7 @@ subfolder
 subtool
 subtools
 successes
+suggest
 summarization
 summarized
 sup
@ -640,6 +644,7 @@ ta
 tails
 templates
 theoretically
+thereby
 thereof
 thread
 ticks
--- a/pkg/front_end/tool/duplicate_code_finder_experiment.dart
+++ b/pkg/front_end/tool/duplicate_code_finder_experiment.dart
@ -0,0 +1,363 @@
+// Copyright (c) 2022, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+import 'dart:io';
+
+import 'package:_fe_analyzer_shared/src/scanner/scanner.dart'
+    show ErrorToken, ScannerConfiguration, StringScanner;
+
+import 'package:_fe_analyzer_shared/src/scanner/token.dart'
+    show BeginToken, SimpleToken, Token, TokenType;
+
+class Duplicate {
+  final List<FromToUri> where;
+  final String example;
+
+  Duplicate(this.where, this.example);
+
+  @override
+  String toString() => "Duplicate[$example]";
+}
+
+class FromToUri {
+  final Uri uri;
+  final int startOffset;
+  final int endOffset;
+
+  FromToUri(this.uri, this.startOffset, this.endOffset);
+}
+
+class Line {
+  String content;
+  Uri uri;
+  int startOffset;
+  int endOffset;
+  Line? previous;
+  Line? next;
+
+  Line(
+      this.content, this.uri, this.startOffset, this.endOffset, this.previous) {
+    if (previous != null) {
+      previous!.next = this;
+    }
+  }
+}
+
+class ExtendedLines {
+  List<Line> startLines;
+  int lineCount;
+
+  ExtendedLines(this.startLines, this.lineCount);
+}
+
+class MultiMap<K, V> {
+  Map<K, List<V>> data = {};
+
+  void operator []=(K key, V value) {
+    List<V>? lookup = data[key];
+    if (lookup != null) {
+      lookup.add(value);
+    } else {
+      data[key] = [value];
+    }
+  }
+}
+
+void _indexLines(MultiMap<String, Line> mapped, Set<String> denyListed,
+    String data, Uri uri) {
+  // TODO(jensj): Work directly on scanned tokens, or use parser as well?
+  // * Should probably only operate on body content, and then not cross bracket
+  //   boundaries in that it should suggest that "foo; } } bar;" is a duplicate
+  //   -- while it might be duplicate, we can't replace that with a function
+  //   call (to get rid of duplicate code).
+  // * If something is a string we could perhaps ignore the content so that
+/*
+    assert(declaration.parent == _libraryTypeParameterScopeBuilder);
+    Map<String, Builder> members = declaration.members!;
+    Map<String, MemberBuilder> constructors = declaration.constructors!;
+    Map<String, MemberBuilder> setters = declaration.setters!;
+
+    Scope classScope = new Scope(
+        local: members,
+        setters: setters,
+        parent: scope.withTypeVariables(typeVariables),
+        debugName: "class $className",
+        isModifiable: false);
+*/
+  //   and
+/*
+    assert(declaration.parent == _libraryTypeParameterScopeBuilder);
+    Map<String, Builder> members = declaration.members!;
+    Map<String, MemberBuilder> constructors = declaration.constructors!;
+    Map<String, MemberBuilder> setters = declaration.setters!;
+
+    Scope classScope = new Scope(
+        local: members,
+        setters: setters,
+        parent: scope.withTypeVariables(typeVariables),
+        debugName: "extension $extensionName",
+        isModifiable: false);
+*/
+  //   could match.
+  Token scannedToken = _scan(data);
+  if (scannedToken is ErrorToken) throw "Can't operate on erroneous data";
+  Token token = scannedToken;
+  StringBuffer sb = new StringBuffer();
+  String space = "";
+  int? startOffset;
+
+  List<Token> endGroups = [];
+  Line? previousLine;
+
+  void endLine(Token lastToken) {
+    String s = sb.toString();
+    int lineStart = startOffset!;
+    sb.clear();
+    space = "";
+    startOffset = null;
+
+    Line line = new Line(s, uri, lineStart, lastToken.charEnd, previousLine);
+    previousLine = line;
+    if (!denyListed.contains(s)) mapped[s] = line;
+  }
+
+  while (true) {
+    sb.write(space);
+    sb.write(token.lexeme);
+    space = " ";
+    startOffset ??= token.charOffset;
+
+    if (endGroups.isNotEmpty && endGroups.last == token) {
+      endLine(token);
+      endGroups.removeLast();
+    } else if (token is BeginToken &&
+        token.type == TokenType.OPEN_CURLY_BRACKET &&
+        token.endGroup != null) {
+      // End line on a "{".
+      endLine(token);
+      endGroups.add(token.endGroup!);
+    } else if (token is SimpleToken && token.type == TokenType.SEMICOLON) {
+      // End line on a ";".
+      endLine(token);
+    } else if (token.next!.isEof) {
+      endLine(token);
+      break;
+    }
+
+    token = token.next!;
+  }
+}
+
+void _extendIndexedLines(List<Line> lines, Set<Line> alreadyIncluded,
+    List<Duplicate> foundDuplicates, Set<String> denyListed) {
+  int length = lines.length;
+
+  if (length == 1) {
+    // The indexed line was only seen once. That's not a duplicate!
+    return;
+  }
+
+  // We move forward in the data, and if having found a -> b -> c
+  // after having processed 'a' we shouldn't process from 'b' too. We thus
+  // remove already included lines here.
+  for (Line line in lines) {
+    if (alreadyIncluded.contains(line)) {
+      length--;
+    }
+  }
+  if (length <= 1) {
+    // We've seen this line before, but all duplicates was included in another
+    // actual duplicate find. Don't process and report again!
+    return;
+  }
+
+  // Can this potential duplicate be extended?
+  List<ExtendedLines>? extended = _extend(lines,
+      existingMatchCount: 1, leastMatches: 3, denyListed: denyListed);
+  if (extended == null || extended.isEmpty) return;
+  for (ExtendedLines extendedLine in extended) {
+    for (Line line in extendedLine.startLines) {
+      int left = extendedLine.lineCount - 1;
+      Line l = line;
+      while (left > 0) {
+        alreadyIncluded.add(l);
+        left--;
+        l = l.next!;
+      }
+    }
+    List<FromToUri> where = [];
+    StringBuffer sb = new StringBuffer();
+    String? example;
+    for (Line firstLine in extendedLine.startLines) {
+      int left = extendedLine.lineCount - 1;
+      Line lastLine = firstLine;
+      if (example == null) sb.writeln(lastLine.content);
+      while (left > 0) {
+        left--;
+        lastLine = lastLine.next!;
+        if (example == null) sb.writeln(lastLine.content);
+      }
+      where.add(new FromToUri(
+          firstLine.uri, firstLine.startOffset, lastLine.endOffset));
+      example ??= sb.toString();
+    }
+    foundDuplicates.add(new Duplicate(where, example!));
+  }
+}
+
+List<Duplicate> findDuplicates(Map<Uri, String> data, {bool verbose = false}) {
+  MultiMap<String, Line> indexedLines = new MultiMap();
+  const Set<String> denyListed = const {"}", "return ;", ";", ") ;", "else {"};
+
+  for (MapEntry<Uri, String> entry in data.entries) {
+    _indexLines(indexedLines, denyListed, entry.value, entry.key);
+  }
+
+  // TODO(jensj): The already included approach is too simple. E.g.
+  /*
+match1;
+nomatch;
+
+match0;
+match1;
+match2;
+match3;
+
+vs
+
+match1;
+nomatchX;
+
+match0;
+match1;
+match2;
+match3;
+
+would first find match1 match2 match3 --- then match0 match1 match2 match3.
+*/
+
+  Set<Line> alreadyIncluded = {};
+  List<Duplicate> result = [];
+  for (MapEntry<String, List<Line>> entry in indexedLines.data.entries) {
+    _extendIndexedLines(entry.value, alreadyIncluded, result, denyListed);
+  }
+
+  if (verbose) {
+    if (result.length == 0) {
+      print("Didn't find any duplicates.");
+    } else if (result.length == 1) {
+      print("Found 1 duplicate:");
+    } else {
+      print("Found ${result.length} duplicates:");
+    }
+    for (Duplicate duplicate in result) {
+      print("Found '${duplicate.example}' at:");
+      for (FromToUri where in duplicate.where) {
+        print("${where.uri}: ${where.startOffset} -> ${where.endOffset}");
+      }
+      print("----\n\n");
+    }
+  }
+
+  return result;
+}
+
+/// Given a list of lines that match, find duplicates that match on more lines,
+/// thereby extending and possibly splitting the match.
+List<ExtendedLines>? _extend(List<Line> lines,
+    {required int existingMatchCount,
+    required int leastMatches,
+    required Set<String> denyListed}) {
+  MultiMap<String, Line> mapped = new MultiMap();
+
+  // E.g. for this input
+  // a -> b1 -> c1 -> d1
+  // a -> b1 -> c1 -> d2
+  // a -> b2 -> c2 -> d1
+  // a -> b2 -> c2 -> d2
+  // a -> b3 -> c2 -> d1
+  // we'd like it to be split into
+  // [a, b1, c1] and [a, b2, c2]
+
+  for (Line line in lines) {
+    Line? next = line.next;
+    if (next != null) {
+      mapped[next.content] = next;
+    }
+  }
+
+  List<ExtendedLines>? result;
+
+  for (MapEntry<String, List<Line>> entry in mapped.data.entries) {
+    if (entry.value.length == 1) {
+      continue;
+    }
+
+    int newMatchCount = existingMatchCount + 1;
+    // Don't count e.g. '}' as an actual match -> require one additional match.
+    // Notice that we can't just not count it as that would destroy the count
+    // which we use to go back and forth between first and last matched line.
+    if (denyListed.contains(entry.key)) {
+      leastMatches++;
+    }
+
+    List<ExtendedLines>? extended = _extend(entry.value,
+        existingMatchCount: newMatchCount,
+        leastMatches: leastMatches,
+        denyListed: denyListed);
+    if (extended != null) {
+      // Was extended further.
+      (result ??= []).addAll(extended);
+    } else if (newMatchCount >= leastMatches) {
+      // Couldn't be extended further, but this was far enough.
+      (result ??= []).add(new ExtendedLines(
+          entry.value.map((Line endLine) {
+            Line line = endLine;
+            int back = existingMatchCount;
+            while (back > 0) {
+              line = line.previous!;
+              back--;
+            }
+            return line;
+          }).toList(),
+          newMatchCount));
+    }
+  }
+
+  return result;
+}
+
+Token _scan(String data) {
+  ScannerConfiguration scannerConfiguration = new ScannerConfiguration(
+      enableTripleShift: true,
+      enableExtensionMethods: true,
+      enableNonNullable: true,
+      forAugmentationLibrary: false);
+
+  StringScanner scanner =
+      new StringScanner(data, configuration: scannerConfiguration);
+  Token firstToken = scanner.tokenize();
+  return firstToken;
+}
+
+void main(List<String> args) {
+  if (args.isEmpty) {
+    args = [
+      Platform.script
+          .resolve("../lib/src/fasta/source/source_library_builder.dart")
+          .toFilePath()
+    ];
+  }
+  bool printed = false;
+  for (String s in args) {
+    File f = new File(s);
+    if (!f.existsSync()) continue;
+    String data = f.readAsStringSync();
+
+    if (printed) print("\n\n=============\n\n");
+    print("Output on $s:");
+    findDuplicates({Uri.parse(s): data}, verbose: true);
+    printed = true;
+  }
+}
--- a/pkg/front_end/tool/duplicate_code_finder_test.dart
+++ b/pkg/front_end/tool/duplicate_code_finder_test.dart
@ -0,0 +1,71 @@
+// Copyright (c) 2022, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+import 'duplicate_code_finder_experiment.dart';
+
+void main() {
+  exactlySameLine();
+  sameLineDifferentIndentation();
+}
+
+Uri testUri = Uri.parse("test://uri.dart");
+
+void exactlySameLine() {
+  // Can find exactly the same lines.
+  List<Duplicate> result = findDuplicates({
+    testUri: """
+if (whatever()) {
+  Foo f = new Foo();
+  f.partA();
+  f.partB();
+  f.partC();
+} else {
+  print("Else case");
+  Foo f = new Foo();
+  f.partA();
+  f.partB();
+  f.partC();
+}
+""",
+  });
+  if (result.length != 1) throw "Didn't find exactly 1 result; got $result";
+  String s = result.single.toString();
+  if (!s.contains("Foo f = new Foo ( ) ;") ||
+      !s.contains("f . partA ( ) ;") ||
+      !s.contains("f . partB ( ) ;") ||
+      !s.contains("f . partC ( ) ;")) {
+    throw "Didn't contain expected, was $s";
+  }
+}
+
+void sameLineDifferentIndentation() {
+  // Can find exactly the same lines.
+  List<Duplicate> result = findDuplicates({
+    testUri: """
+if (whatever()) {
+  Foo f = new Foo();
+  f.partA();
+  f.partB();
+  f.partC();
+} else {
+  if (something()) {
+    print("Else case with something");
+  } else {
+    Foo f = new Foo();
+    f.partA();
+    f.partB();
+    f.partC();
+  }
+}
+""",
+  });
+  if (result.length != 1) throw "Didn't find exactly 1 result; got $result";
+  String s = result.single.toString();
+  if (!s.contains("Foo f = new Foo ( ) ;") ||
+      !s.contains("f . partA ( ) ;") ||
+      !s.contains("f . partB ( ) ;") ||
+      !s.contains("f . partC ( ) ;")) {
+    throw "Didn't contain expected, was $s";
+  }
+}