mirror of
https://github.com/koraa/huniq
synced 2024-10-01 18:03:30 +00:00
feat: Tests and basic benchmarks
This commit is contained in:
parent
5ad413137b
commit
2b82786895
71
benchmark.sh
Normal file
71
benchmark.sh
Normal file
|
@ -0,0 +1,71 @@
|
|||
#! /bin/bash
#
# Benchmark harness: compares the rust huniq build, the C++ huniq (huniq1)
# and the sort/uniq coreutils against each other.
#
# bash is required (not plain sh): the script relies on `[[ ]]` and on
# associative arrays (`declare -A`).

# Abort on the first failing command.
set -e
# The benchmark loops forever; make sure a signal terminates the whole
# script instead of only the command that is currently running.
trap "exit" INT TERM # exit from loop

# All paths below are relative to the directory that contains this script.
cd "$(dirname "$0")"
huniq2bin="./target/release/huniq"     # rust implementation (release build)
huniq1dir="./target/benchmark/huniq1"  # checkout location of the C++ implementation
huniq1bin="${huniq1dir}/huniq"         # C++ binary inside that checkout
|
||||
|
||||
# Run the given command under `time` and throw away the command's stdout.
# The format string asks `time` for "%e %M" (elapsed seconds and maximum
# resident set size in the GNU time manual's terms).
# Arguments: the command to run, followed by its arguments.
measure() {
  env time "-f%e %M" "$@" > /dev/null
}
|
||||
|
||||
# Benchmark the rust implementation on stdin.
# Globals:   huniq2bin (read) - path of the rust binary; the same variable
#            main uses to decide whether a build is needed, so the binary
#            that is checked/built is also the one that gets measured.
# Arguments: extra flags passed through to huniq (e.g. -c).
bench_rust() {
  measure "$huniq2bin" "$@"
}
|
||||
|
||||
# Benchmark the C++ implementation on stdin.
# Globals:   huniq1bin (read) - path of the C++ huniq binary.
# Arguments: extra flags passed through to the binary.
bench_cpp() {
  # Put the binary in front of the caller's flags, then measure the lot.
  set -- "$huniq1bin" "$@"
  measure "$@"
}
|
||||
|
||||
# Benchmark the coreutils baseline on stdin.
#
# Without arguments the baseline is a single `sort -u`. With arguments
# (e.g. -c) it is `sort | uniq ARGS`; both stages are measured separately
# and folded into one "elapsed mem" line.
#
# Arguments: flags for uniq; none selects the `sort -u` variant.
# Outputs:   one "elapsed mem" line on stderr, the stream `measure`
#            reports on in the single-command case.
bench_shell() {
  if (( $# == 0 )); then
    measure sort -u
  else
    {
      # The flags belong to uniq; without them count mode would silently
      # benchmark plain `sort | uniq`.
      measure sort | measure uniq "$@"
    } 2>&1 | awk '
      {
        # The stages run concurrently: the wall time of the pipeline is
        # that of the slowest stage, while the memory footprints add up.
        if ($1 + 0 > elapsed + 0) elapsed = $1;
        mem += $2;
      }

      END {
        print(elapsed, mem);
      }' >&2
  fi
}
|
||||
|
||||
# Prepare both huniq binaries, then benchmark forever.
#
# Each round prints a "mode repeats impl " label and then feeds the
# concatenation of /usr/share/dict/*, repeated `repeats` times, into the
# bench_<impl> function for that implementation.
#
# Globals: huniq2bin, huniq1dir, huniq1bin (read)
#
# NOTE(review): no invocation of main is visible in this script; confirm
# that something actually calls it.
main() {
  # Build the rust implementation unless the release binary already exists.
  if ! test -e "$huniq2bin"; then
    cargo build --release
  fi

  # Fetch the C++ sources on first use; clone output is sent to stderr.
  if ! test -e "$huniq1dir"; then
    git clone --recursive "https://github.com/SoftwearDevelopment/huniq" "$huniq1dir" >&2
  fi

  # Build the C++ binary inside its checkout, then return to where we were.
  if ! test -e "$huniq1bin"; then
    {
      cd "$huniq1dir"
      make
      cd -
    } >&2
  fi

  # Command line flags that select each benchmark mode.
  declare -A modeargs=(
    [uniq]=""
    [count]="-c"
  )

  while :; do
    for mode in "uniq" "count"; do
      for repeats in 1 2 5 10 50 100; do
        for impl in rust cpp shell; do
          echo -n "$mode $repeats $impl "
          # The flags are expanded unquoted on purpose: an empty entry must
          # vanish instead of being passed on as an empty argument.
          yes \
            | head -n "$repeats" \
            | while read _; do cat /usr/share/dict/*; done \
            | "bench_${impl}" ${modeargs[${mode}]}
        done
      done
    done
  done
}
|
2
benchmark_results.txt
Normal file
2
benchmark_results.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
uniq 1 rust uniq 1 cpp uniq 1 shell uniq 2 rust uniq 2 cpp uniq 2 shell uniq 5 rust uniq 5 cpp uniq 5 shell uniq 10 rust uniq 10 cpp uniq 10 shell uniq 50 rust uniq 50 cpp uniq 50 shell uniq 100 rust uniq 100 cpp uniq 100 shell count 1 rust count 1 cpp count 1 shell 9.97 11332
|
||||
count 2 rust count 2 cpp
|
55
readme.md
55
readme.md
|
@ -26,8 +26,8 @@ in -c/count mode.
|
|||
|
||||
## Motivation
|
||||
|
||||
Sorting is slow and requires lot's of memory. By using hash tables/hash sets instead of sorting
|
||||
the input huniq is generally faster and requires less memory than the combination of `sort` and `uniq.`
|
||||
Sorting is slow. By using hash tables/hash sets instead of sorting
|
||||
the input huniq is generally faster than `sort -u` or `sort | uniq -c`.
|
||||
|
||||
## Version History
|
||||
|
||||
|
@ -37,6 +37,57 @@ Changes made in version 2:
|
|||
|
||||
* The -d/-0 flags were added so you can specify custom delimiters
|
||||
* Completely rewritten in rust.
|
||||
* Version two was in the (admittedly very limited) between 1.4x
|
||||
|
||||
## Build
|
||||
|
||||
```sh
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
To run the tests execute:
|
||||
|
||||
```sh
|
||||
bash ./test.sh
|
||||
```
|
||||
|
||||
## Benchmark
|
||||
|
||||
You can use `bash ./benchmark.sh` to execute the benchmarks. They will execute until you manually abort them (e.g. by pressing Ctrl-C).
|
||||
|
||||
From my very limited benchmarking results, I found that the rust implementation is between 1 and 2 times faster than the C++ implementation and between 5 and 10 times
|
||||
faster than the `uniq/sort` standard implementation.
|
||||
|
||||
Surprisingly, sort features the lowest memory usage. All three implementations' memory usage grows with the number of unique lines, and not the number
|
||||
of total lines, so sort probably manually optimizes for that. Sort's memory usage is about a third of the rust implementation…
|
||||
The difference in memory usage between the rust implementation and the C++ one is quite small; the C++ one uses around 10% less memory.
|
||||
|
||||
The benchmark
|
||||
|
||||
```
|
||||
repetitions implementation seconds memory/kb
|
||||
1 rust 0.89 29568
|
||||
1 cpp 2.62 26080
|
||||
1 shell 10.21 9820
|
||||
2 rust 2.02 29604
|
||||
2 cpp 6.21 26036
|
||||
2 shell 34.33 9724
|
||||
5 rust 5.25 29548
|
||||
5 cpp 12.08 26076
|
||||
5 shell 72.30 10004
|
||||
10 rust 10.26 29548
|
||||
10 cpp 18.87 26128
|
||||
10 shell 151.40 10060
|
||||
50 rust 50.16 29548
|
||||
50 cpp 88.51 26096
|
||||
50 shell 675.88 11048
|
||||
100 rust 84.96 29604
|
||||
100 cpp 149.10 26048
|
||||
```
|
||||
|
||||
## Future direction
|
||||
|
||||
There is some potential for optimizations…like
|
||||
|
||||
## License
|
||||
|
||||
|
|
33
test.sh
Normal file
33
test.sh
Normal file
|
@ -0,0 +1,33 @@
|
|||
#! /bin/bash
#
# Test runner: runs the debug build of huniq against the fixtures in test/
# and compares its output with the expected files.

# Fixture and binary paths are relative to the script's directory, so a
# failed cd must stop the run instead of testing from the wrong place.
cd "$(dirname "$0")" || exit 1
bin="./target/debug/huniq"  # binary under test

failures=0  # number of assertions that did not match
count=0     # number of assertions executed
|
||||
|
||||
# Check that a shell command prints exactly the content of a reference file.
#
# Globals:   count (incremented on every call), failures (incremented when
#            the output differs from the reference)
# Arguments: $1 - short description used in the failure message
#            $2 - path of the file holding the expected output
#            $3.. - the command, handed to eval (so it may contain
#                   redirections and pipes)
# Outputs:   on mismatch, a message and the diff on stderr
# Returns:   0; failures are recorded in the counter, not in the status
assert() {
  local desc="$1"; shift
  local ref="$1"; shift
  local delta

  count=$(( count + 1 ))

  # Evaluate the command exactly once and keep the diff, so the report
  # shows the run that actually failed and side effects are not repeated.
  if ! delta="$(diff <(eval "$@") "$ref")"; then
    echo >&2 "Assertion failed \"$desc\": \`$*\`"
    printf '%s\n' "$delta" >&2
    failures=$(( failures + 1 ))
  fi
}
|
||||
|
||||
# Build the debug binary, run all assertions and print a summary.
#
# Globals: bin, count, failures (read; the counters are updated by assert)
# Outputs: "<count> tests <failures> failures" on stderr
# Returns: 0 when every assertion passed; non-zero when the build failed
#          or at least one assertion failed
main() {
  # The former guard tested a variable this script never defines, so the
  # build ran on every invocation anyway. Keep that (it ensures the tests
  # see the current sources) but stop when the build itself fails.
  cargo build || {
    echo >&2 "cargo build failed; not running tests"
    return 1
  }

  assert uniq test/expect_uniq.txt "$bin <test/input.txt"
  assert count test/expect_count.txt "$bin -c <test/input.txt | sort -nr"

  echo >&2 "$count tests $failures failures"
  test "$failures" -eq 0
}
|
||||
|
||||
main
|
3
test/expect_count.txt
Normal file
3
test/expect_count.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
3 hello
|
||||
2 bar
|
||||
1 foo
|
3
test/expect_uniq.txt
Normal file
3
test/expect_uniq.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
hello
|
||||
foo
|
||||
bar
|
6
test/input.txt
Normal file
6
test/input.txt
Normal file
|
@ -0,0 +1,6 @@
|
|||
hello
|
||||
foo
|
||||
hello
|
||||
hello
|
||||
bar
|
||||
bar
|
Loading…
Reference in a new issue