1
0
mirror of https://github.com/koraa/huniq synced 2024-07-01 07:14:25 +00:00

feat: Tests and basic benchmarks

This commit is contained in:
Karolin Varner 2020-01-16 23:08:19 +01:00
parent 5ad413137b
commit 2b82786895
7 changed files with 171 additions and 2 deletions

71
benchmark.sh Normal file
View File

@ -0,0 +1,71 @@
#! /bin/bash
# Benchmark setup: strict-ish mode, signal handling and binary locations.
#
# bash is required (not plain sh): the script relies on `[[ ]]`,
# associative arrays (`declare -A`) and SIG-prefixed signal names in `trap`,
# none of which are guaranteed by a POSIX /bin/sh such as dash.
set -e
# The benchmark runs in an endless loop; make Ctrl-C / TERM leave the whole
# script instead of just the command that is currently running.
trap "exit" SIGINT SIGTERM
# All paths below are relative to the directory containing this script.
cd "$(dirname "$0")"
# huniq 2 (rust), produced by `cargo build --release`.
huniq2bin="./target/release/huniq"
# huniq 1 (C++) checkout and the binary built inside it.
huniq1dir="./target/benchmark/huniq1"
huniq1bin="${huniq1dir}/huniq"
# Run a command under time(1) and report how expensive it was.
#
# Arguments: "$@" - the command to run, followed by its arguments.
# Input:     stdin is passed through to the command.
# Outputs:   the command's stdout is discarded; time(1) reports
#            "<elapsed seconds> <maximum resident set size in KB>".
# Note:      `env time` is used so the external time binary is executed
#            rather than the shell keyword of the same name.
measure() {
  env time -f '%e %M' "$@" 1>/dev/null
}
# Benchmark the rust implementation (huniq 2).
#
# Globals:   huniq2bin (read) - path of the release binary; the same path
#            main checks for before deciding whether to build.
# Arguments: "$@" - flags forwarded to huniq (e.g. -c).
# Input:     lines to de-duplicate on stdin.
# Outputs:   whatever measure reports for the run.
bench_rust() {
  measure "$huniq2bin" "$@"
}
# Benchmark the C++ implementation (huniq 1).
#
# Globals:   huniq1bin (read) - path of the huniq 1 binary.
# Arguments: "$@" - flags forwarded to huniq (e.g. -c).
# Input:     lines to de-duplicate on stdin.
# Outputs:   whatever measure reports for the run.
bench_cpp() {
  measure "${huniq1bin}" "${@}"
}
# Benchmark the coreutils equivalent of huniq.
#
# Arguments: "$@" - flags for uniq (e.g. -c). With no flags the plain
#            de-duplication is done by a single `sort -u`.
# Input:     lines to process on stdin.
# Outputs:   "<seconds> <max-rss-kb>"; for `sort -u` this is measure's own
#            report, for the `sort | uniq` pipeline it is the combined
#            figure printed by awk on stdout.
bench_shell() {
  if [[ -z "$*" ]]; then
    measure sort -u
  else
    {
      # sort is timed directly instead of through measure: measure throws
      # the command's stdout away, which would leave uniq with empty input.
      # Only the final stage (uniq, with the requested flags) is discarded.
      env time -f'%e %M' sort | measure uniq "$@"
    } 2>&1 | awk '
      {
        # Both stages run concurrently, so the wall time of the pipeline
        # is the time of the slowest stage; memory adds up.
        if ($1 + 0 >= elapsed + 0) elapsed = $1;
        mem += $2;
      }
      END {
        print(elapsed, mem);
      }'
  fi
}
# Entry point: make sure both huniq builds exist, then benchmark forever.
#
# Globals:   huniq2bin, huniq1dir, huniq1bin (read).
# Outputs:   one line per run on stdout:
#            "<mode> <repeats> <impl> <seconds> <max-rss-kb>".
#            Build/clone chatter goes to stderr.
# Returns:   never returns on its own; abort with Ctrl-C.
main() {
  local mode repeats impl i

  test -e "$huniq2bin" || {
    cargo build --release
  }

  test -e "$huniq1dir" || {
    git clone --recursive "https://github.com/SoftwearDevelopment/huniq" "$huniq1dir"
  } >&2

  # Build in a subshell so the script's working directory is never changed,
  # even when make fails.
  test -e "$huniq1bin" || (cd "$huniq1dir" && make) >&2

  # Extra command line flags per benchmark mode.
  local -A modeargs=([uniq]="" [count]="-c")

  while true; do
    for mode in "uniq" "count"; do
      for repeats in 1 2 5 10 50 100; do
        for impl in rust cpp shell; do
          printf '%s %s %s ' "$mode" "$repeats" "$impl"
          # Input: every dictionary file, repeated $repeats times.
          # The flag is only passed when non-empty. time(1) reports on
          # stderr, so merge it into stdout to keep label and measurement
          # on the same line when the output is redirected to a file.
          for ((i = 0; i < repeats; i++)); do
            cat /usr/share/dict/*
          done | "bench_${impl}" ${modeargs[$mode]:+"${modeargs[$mode]}"} 2>&1
        done
      done
    done
  done
}

# Without this call the script only defines functions and exits.
main "$@"

2
benchmark_results.txt Normal file
View File

@ -0,0 +1,2 @@
uniq 1 rust uniq 1 cpp uniq 1 shell uniq 2 rust uniq 2 cpp uniq 2 shell uniq 5 rust uniq 5 cpp uniq 5 shell uniq 10 rust uniq 10 cpp uniq 10 shell uniq 50 rust uniq 50 cpp uniq 50 shell uniq 100 rust uniq 100 cpp uniq 100 shell count 1 rust count 1 cpp count 1 shell 9.97 11332
count 2 rust count 2 cpp

View File

@ -26,8 +26,8 @@ in -c/count mode.
## Motivation
Sorting is slow and requires lot's of memory. By using hash tables/hash sets instead of sorting
the input huniq is generally faster and requires less memory than the combination of `sort` and `uniq.`
Sorting is slow. By using hash tables/hash sets instead of sorting
the input huniq is generally faster than `sort -u` or `sort | uniq -c`.
## Version History
@ -37,6 +37,57 @@ Changes made in version 2:
* The -d/-0 flags were added so you can specify custom delimiters
* Completely rewritten in rust.
* Version two was in the (admittedly very limited) between 1.4x
## Build
```sh
cargo build --release
```
To run the tests execute:
```sh
bash ./test.sh
```
## Benchmark
You can use `bash ./benchmark.sh` to execute the benchmarks. They will execute until you manually abort them (e.g. by pressing Ctrl-C).
From my very limited benchmarking results, I found that the rust implementation is between 1-2x faster than the C++ implementation and between 5-10 times
faster than the `uniq/sort` standard implementation.
Surprisingly, sort features the lowest memory usage. All three implementations' memory usage grows with the number of unique lines, and not the number
of total lines, so sort probably manually optimizes for that. Sort's memory usage is about a third of the rust implementation…
The difference in memory usage between the rust implementation and the C++ one is quite small; the C++ one uses around 10% less memory.
The benchmark
```
repetitions implementation seconds memory/kb
1 rust 0.89 29568
1 cpp 2.62 26080
1 shell 10.21 9820
2 rust 2.02 29604
2 cpp 6.21 26036
2 shell 34.33 9724
5 rust 5.25 29548
5 cpp 12.08 26076
5 shell 72.30 10004
10 rust 10.26 29548
10 cpp 18.87 26128
10 shell 151.40 10060
50 rust 50.16 29548
50 cpp 88.51 26096
50 shell 675.88 11048
100 rust 84.96 29604
100 cpp 149.10 26048
```
## Future direction
There is some potential for optimizations…like
## License

33
test.sh Normal file
View File

@ -0,0 +1,33 @@
#! /bin/bash
# Test runner setup.
#
# All paths used by the tests are relative to the directory of this script;
# bail out if we cannot get there instead of running against the wrong tree.
cd "$(dirname "$0")" || exit 1
# Debug build of huniq that the tests exercise.
bin="./target/debug/huniq"
# Running totals maintained by assert.
failures=0
count=0
# Check that a shell command produces exactly the content of a file.
#
# Globals:   count (incremented on every call),
#            failures (incremented when the output differs).
# Arguments: $1   - short description used in the failure message
#            $2   - file holding the expected output
#            $3.. - command line, evaluated with eval (so it may contain
#                   redirections and pipes)
# Outputs:   on mismatch, the failure message and the diff on stderr.
assert() {
  local desc="$1"; shift
  local ref="$1"; shift
  local delta

  count=$(( count + 1 ))

  # Run the command under test exactly once and keep the diff, rather than
  # re-running it just to display the difference.
  if ! delta="$(diff <(eval "$*") "$ref")"; then
    echo >&2 "Assertion failed \"$desc\": \`$*\`"
    printf '%s\n' "$delta" >&2
    failures=$(( failures + 1 ))
  fi
}
# Entry point of the test suite.
#
# Globals:   bin (read), count and failures (read; updated via assert).
# Outputs:   a "<count> tests <failures> failures" summary on stderr.
# Returns:   0 when every assertion passed, non-zero otherwise (or when the
#            binary had to be built and the build failed).
main() {
  # Build the debug binary if it is missing. The check has to look at $bin;
  # this script defines no other binary path variable.
  if [[ ! -e "$bin" ]]; then
    cargo build || return
  fi

  assert uniq test/expect_uniq.txt "$bin <test/input.txt"
  # -c output order is not sorted, so normalise it before comparing.
  assert count test/expect_count.txt "$bin -c <test/input.txt | sort -nr"

  echo >&2 "$count tests $failures failures"
  test "$failures" -eq 0
}
# Run the suite; as the last command, main's status (0 only when no
# assertion failed) becomes the exit status of the script.
main

3
test/expect_count.txt Normal file
View File

@ -0,0 +1,3 @@
3 hello
2 bar
1 foo

3
test/expect_uniq.txt Normal file
View File

@ -0,0 +1,3 @@
hello
foo
bar

6
test/input.txt Normal file
View File

@ -0,0 +1,6 @@
hello
foo
hello
hello
bar
bar