From 2b8278689591fe027c434f0d01bcaf2bd44561c6 Mon Sep 17 00:00:00 2001
From: Karolin Varner <karo@cupdev.net>
Date: Thu, 16 Jan 2020 23:08:19 +0100
Subject: [PATCH] feat: Tests and basic benchmarks

---
 benchmark.sh          | 71 +++++++++++++++++++++++++++++++++++++++++++
 benchmark_results.txt |  2 ++
 readme.md             | 55 +++++++++++++++++++++++++++++++--
 test.sh               | 33 ++++++++++++++++++++
 test/expect_count.txt |  3 ++
 test/expect_uniq.txt  |  3 ++
 test/input.txt        |  6 ++++
 7 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 benchmark.sh
 create mode 100644 benchmark_results.txt
 create mode 100644 test.sh
 create mode 100644 test/expect_count.txt
 create mode 100644 test/expect_uniq.txt
 create mode 100644 test/input.txt

diff --git a/benchmark.sh b/benchmark.sh
new file mode 100644
index 0000000..458385e
--- /dev/null
+++ b/benchmark.sh
@@ -0,0 +1,71 @@
+#! /bin/sh
+
+set -e
+trap "exit" SIGINT SIGTERM # exit from loop
+
+cd "$(dirname "$0")"
+huniq2bin="./target/release/huniq"
+huniq1dir="./target/benchmark/huniq1"
+huniq1bin="${huniq1dir}/huniq"
+
+measure() { # Run "$@" under time(1): its stdout is dropped, "<seconds> <memory KB>" is reported on stderr.
+  local report='%e %M'; env time -f "$report" "$@" >/dev/null
+}
+
+bench_rust() { # Measure the rust huniq build on stdin; "$@" are passed through as its flags.
+  measure "$huniq2bin" "$@" # the same path main() checks for and builds
+}
+
+bench_cpp() { # Measure the C++ huniq1 binary on stdin; "$@" are forwarded as its flags.
+  measure "${huniq1bin}" "${@}"
+}
+
+bench_shell() {
+  # Coreutils baseline, input on stdin: no flags -> `sort -u`, flags (-c) -> `sort | uniq <flags>`.
+  if [[ $# -eq 0 ]]; then
+    measure sort -u
+  else
+    {
+      # sort is timed without measure: measure sends stdout to /dev/null and would starve uniq.
+      env time -f'%e %M' sort | measure uniq "$@"
+    } 2>&1 | awk '
+      {
+        # The two stages run concurrently: wall time is the larger one, memory adds up.
+        if (NR == 1 || $1 + 0 > elapsed + 0) elapsed = $1;
+        mem += $2;
+      }
+      END { print(elapsed, mem); }' >&2 # stderr, the stream the other measurements use
+  fi
+}
+
+main() {
+  # Build or fetch whatever is missing first; progress output is kept off stdout.
+  test -e "$huniq2bin" || cargo build --release >&2
+
+  test -e "$huniq1dir" || {
+    git clone --recursive "https://github.com/SoftwearDevelopment/huniq" "$huniq1dir"
+  } >&2
+
+  # make -C builds inside the checkout without leaving the working directory.
+  test -e "$huniq1bin" || make -C "$huniq1dir" >&2
+
+  declare -A modeargs
+  modeargs[uniq]=""
+  modeargs[count]="-c"
+
+  while true; do # endless by design; the SIGINT/SIGTERM trap ends the script
+    for mode in "uniq" "count"; do
+      for repeats in 1 2 5 10 50 100; do
+        for impl in rust cpp shell; do
+          printf '%s %s %s ' "$mode" "$repeats" "$impl"
+          # Input is /usr/share/dict/* concatenated $repeats times. The flag expansion stays
+          # unquoted on purpose: the empty uniq-mode entry must not become an empty argument.
+          yes | head -n "$repeats" | while read -r _; do cat /usr/share/dict/*; done \
+            | "bench_${impl}" ${modeargs[${mode}]}
+        done
+      done
+    done
+  done
+}
+
+main "$@"
diff --git a/benchmark_results.txt b/benchmark_results.txt
new file mode 100644
index 0000000..8eba86a
--- /dev/null
+++ b/benchmark_results.txt
@@ -0,0 +1,2 @@
+uniq 1 rust uniq 1 cpp uniq 1 shell uniq 2 rust uniq 2 cpp uniq 2 shell uniq 5 rust uniq 5 cpp uniq 5 shell uniq 10 rust uniq 10 cpp uniq 10 shell uniq 50 rust uniq 50 cpp uniq 50 shell uniq 100 rust uniq 100 cpp uniq 100 shell count 1 rust count 1 cpp count 1 shell 9.97 11332
+count 2 rust count 2 cpp 
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 6f08c90..eae369a 100644
--- a/readme.md
+++ b/readme.md
@@ -26,8 +26,8 @@ in -c/count mode.
 
 ## Motivation
 
-Sorting is slow and requires lot's of memory. By using hash tables/hash sets instead of sorting
-the input huniq is generally faster and requires less memory than the combination of `sort` and `uniq.`
+Sorting is slow. By using hash tables/hash sets instead of sorting
+the input, huniq is generally faster than `sort -u` or `sort | uniq -c`.
 
 ## Version History
 
@@ -37,6 +37,57 @@ Changes made in version 2:
 
 * The -d/-0 flags where added so you can specify custom delimiters
 * Completely rewritten in rust.
+* In the (admittedly very limited) benchmarks, version two was at least 1.4x faster than version one (see the Benchmark section below).
+
+## Build
+
+```sh
+cargo build --release
+```
+
+To run the tests execute:
+
+```sh
+bash ./test.sh
+```
+
+## Benchmark
+
+You can use `bash ./benchmark.sh` to execute the benchmarks. They will execute until you manually abort them (e.g. by pressing Ctrl-C).
+
+From my very limited benchmarking results, I found that the rust implementation is between 1x and 2x faster than the C++ implementation and between 5x and 10x
+faster than the standard `sort`/`uniq` implementation.
+
+Surprisingly, sort features the lowest memory usage. All three implementations' memory usage grows with the number of unique lines, and not the number
+of total lines, so sort probably manually optimizes for that. Sort's memory usage is about a third of that of the rust implementation.
+The difference in memory usage between the rust implementation and the C++ one is quite small; the C++ one uses around 10% less memory.
+
+The benchmark results:
+
+```
+repetitions  implementation seconds  memory/kb
+1            rust              0.89      29568
+1            cpp               2.62      26080
+1            shell            10.21       9820
+2            rust              2.02      29604
+2            cpp               6.21      26036
+2            shell            34.33       9724
+5            rust              5.25      29548
+5            cpp              12.08      26076
+5            shell            72.30      10004
+10           rust             10.26      29548
+10           cpp              18.87      26128
+10           shell           151.40      10060
+50           rust             50.16      29548
+50           cpp              88.51      26096
+50           shell           675.88      11048
+100          rust             84.96      29604
+100          cpp             149.10      26048
+```
+
+## Future direction
+
+There is some potential for further optimizations (the specifics still need to be written down).
 
 ## License
 
diff --git a/test.sh b/test.sh
new file mode 100644
index 0000000..0a79fbc
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,33 @@
+#! /bin/bash
+
+cd "$(dirname "$0")"
+bin="./target/debug/huniq"
+
+failures=0
+count=0
+
+assert() {
+  # assert DESC REF CMD...: eval CMD and require its stdout to match the file REF exactly.
+  local desc="$1" ref="$2" delta; shift 2
+  (( count++ ))
+  # Run CMD only once and keep the diff, so the diff shown is the one that actually failed.
+  delta="$(diff <(eval "$@") "$ref")" || {
+    echo >&2 "Assertion failed \"$desc\": \`$*\`"
+    printf '%s\n' "$delta" >&2
+    (( failures++ ))
+  }
+}
+
+main() {
+  # Build unconditionally so a stale binary is never tested; an up-to-date build is a no-op
+  # for cargo. The function's status is 0 only when every assertion passed.
+  cargo build || return 1
+
+  assert uniq test/expect_uniq.txt "$bin <test/input.txt"
+  assert count test/expect_count.txt "$bin -c <test/input.txt | sort -nr"
+
+  echo >&2 "$count tests $failures failures"
+  test "$failures" -eq 0
+}
+
+main "$@"
diff --git a/test/expect_count.txt b/test/expect_count.txt
new file mode 100644
index 0000000..9a9c459
--- /dev/null
+++ b/test/expect_count.txt
@@ -0,0 +1,3 @@
+3 hello
+2 bar
+1 foo
diff --git a/test/expect_uniq.txt b/test/expect_uniq.txt
new file mode 100644
index 0000000..a73df94
--- /dev/null
+++ b/test/expect_uniq.txt
@@ -0,0 +1,3 @@
+hello
+foo
+bar
diff --git a/test/input.txt b/test/input.txt
new file mode 100644
index 0000000..e60394d
--- /dev/null
+++ b/test/input.txt
@@ -0,0 +1,6 @@
+hello
+foo
+hello
+hello
+bar
+bar