From 2b8278689591fe027c434f0d01bcaf2bd44561c6 Mon Sep 17 00:00:00 2001 From: Karolin Varner Date: Thu, 16 Jan 2020 23:08:19 +0100 Subject: [PATCH] feat: Tests and basic benchmarks --- benchmark.sh | 71 +++++++++++++++++++++++++++++++++++++++++++ benchmark_results.txt | 2 ++ readme.md | 55 +++++++++++++++++++++++++++++++-- test.sh | 33 ++++++++++++++++++++ test/expect_count.txt | 3 ++ test/expect_uniq.txt | 3 ++ test/input.txt | 6 ++++ 7 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 benchmark.sh create mode 100644 benchmark_results.txt create mode 100644 test.sh create mode 100644 test/expect_count.txt create mode 100644 test/expect_uniq.txt create mode 100644 test/input.txt diff --git a/benchmark.sh b/benchmark.sh new file mode 100644 index 0000000..458385e --- /dev/null +++ b/benchmark.sh @@ -0,0 +1,71 @@ +#! /bin/sh + +set -e +trap "exit" SIGINT SIGTERM # exit from loop + +cd "$(dirname "$0")" +huniq2bin="./target/release/huniq" +huniq1dir="./target/benchmark/huniq1" +huniq1bin="${huniq1dir}/huniq" + +measure() { + env time -f'%e %M' "$@" >/dev/null +} + +bench_rust() { + measure ./target/release/huniq "$@" +} + +bench_cpp() { + measure "$huniq1bin" "$@" +} + +bench_shell() { + if [[ "$@" = "" ]]; then + measure sort -u + else + { + measure sort | measure uniq + } 2>&1 | awk ' + { + elapsed=$1; + mem+=$2; + } + + END { + print(elapsed, mem); + }' + fi +} + +main() { + test -e "$huniq2bin" || { + cargo build --release + } + + test -e "$huniq1dir" || { + git clone --recursive "https://github.com/SoftwearDevelopment/huniq" "$huniq1dir" + } >&2 + + test -e "$huniq1bin" || { + cd "$huniq1dir" + make + cd - + } >&2 + + declare -A modeargs + modeargs[uniq]="" + modeargs[count]="-c" + + while true; do + for mode in "uniq" "count"; do + for repeats in 1 2 5 10 50 100; do + for impl in rust cpp shell; do + echo -n "$mode $repeats $impl " + yes | head -n "$repeats" | while read _; do cat /usr/share/dict/*; done \ + | "bench_${impl}" 
${modeargs[${mode}]} + done + done + done + done +} diff --git a/benchmark_results.txt b/benchmark_results.txt new file mode 100644 index 0000000..8eba86a --- /dev/null +++ b/benchmark_results.txt @@ -0,0 +1,2 @@ +uniq 1 rust uniq 1 cpp uniq 1 shell uniq 2 rust uniq 2 cpp uniq 2 shell uniq 5 rust uniq 5 cpp uniq 5 shell uniq 10 rust uniq 10 cpp uniq 10 shell uniq 50 rust uniq 50 cpp uniq 50 shell uniq 100 rust uniq 100 cpp uniq 100 shell count 1 rust count 1 cpp count 1 shell 9.97 11332 +count 2 rust count 2 cpp \ No newline at end of file diff --git a/readme.md b/readme.md index 6f08c90..eae369a 100644 --- a/readme.md +++ b/readme.md @@ -26,8 +26,8 @@ in -c/count mode. ## Motivation -Sorting is slow and requires lot's of memory. By using hash tables/hash sets instead of sorting -the input huniq is generally faster and requires less memory than the combination of `sort` and `uniq.` +Sorting is slow. By using hash tables/hash sets instead of sorting +the input, huniq is generally faster than `sort -u` or `sort | uniq -c`. ## Version History @@ -37,6 +37,57 @@ Changes made in version 2: * The -d/-0 flags where added so you can specify custom delimiters * Completely rewritten in rust. +* In my (admittedly very limited) benchmarks, version two was at least 1.4x faster than version one. + +## Build + +```sh +cargo build --release +``` + +To run the tests, execute: + +```sh +bash ./test.sh +``` + +## Benchmark + +You can use `bash ./benchmark.sh` to execute the benchmarks. They will execute until you manually abort them (e.g. by pressing Ctrl-C). + +From my very limited benchmarking results, I found that the rust implementation is 1-2x faster than the C++ implementation and 5-10 times +faster than the standard `sort`/`uniq` implementation. + +Surprisingly, sort features the lowest memory usage. All three implementations' memory usage grows with the number of unique lines, and not the number +of total lines, so sort probably manually optimizes for that.
Sort's memory usage is about a third of that of the rust implementation. +The difference in memory usage between the rust implementation and the C++ one is quite small; the C++ one uses around 10% less memory. + +The benchmark results: + +``` +repetitions implementation seconds memory/kb +1 rust 0.89 29568 +1 cpp 2.62 26080 +1 shell 10.21 9820 +2 rust 2.02 29604 +2 cpp 6.21 26036 +2 shell 34.33 9724 +5 rust 5.25 29548 +5 cpp 12.08 26076 +5 shell 72.30 10004 +10 rust 10.26 29548 +10 cpp 18.87 26128 +10 shell 151.40 10060 +50 rust 50.16 29548 +50 cpp 88.51 26096 +50 shell 675.88 11048 +100 rust 84.96 29604 +100 cpp 149.10 26048 +``` + +## Future direction + +There is some potential for further optimizations. ## License diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..0a79fbc --- /dev/null +++ b/test.sh @@ -0,0 +1,33 @@ +#! /bin/bash + +cd "$(dirname "$0")" +bin="./target/debug/huniq" + +failures=0 +count=0 + +assert() { + local desc="$1"; shift + local ref="$1"; shift + + (( count++ )) + diff <(eval "$@") "$ref" >/dev/null || { + echo >&2 "Assertion failed \"$desc\": \`$@\`" + diff <(eval "$@") "$ref" >&2 + (( failures++ )) + } +} + +main() { + test -e "$huniq2bin" || { + cargo build + } + + assert uniq test/expect_uniq.txt "$bin &2 "$count tests $failures failures" + test "$failures" -eq 0 +} + +main diff --git a/test/expect_count.txt b/test/expect_count.txt new file mode 100644 index 0000000..9a9c459 --- /dev/null +++ b/test/expect_count.txt @@ -0,0 +1,3 @@ +3 hello +2 bar +1 foo diff --git a/test/expect_uniq.txt b/test/expect_uniq.txt new file mode 100644 index 0000000..a73df94 --- /dev/null +++ b/test/expect_uniq.txt @@ -0,0 +1,3 @@ +hello +foo +bar diff --git a/test/input.txt b/test/input.txt new file mode 100644 index 0000000..e60394d --- /dev/null +++ b/test/input.txt @@ -0,0 +1,6 @@ +hello +foo +hello +hello +bar +bar