Sort: Implement stable sort, ignore non-printing, month sort dedup, auto parallel sort through rayon, zero terminated sort, check silent (#2008)

This commit is contained in:
electricboogie 2021-04-08 15:07:09 -05:00 committed by GitHub
parent b26e12eaa4
commit 8474249e5f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
35 changed files with 1444 additions and 213 deletions

20
Cargo.lock generated
View file

@ -1362,12 +1362,6 @@ dependencies = [
"maybe-uninit",
]
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.8.0"
@ -1522,17 +1516,6 @@ dependencies = [
"serde_json",
]
[[package]]
name = "twox-hash"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04f8ab788026715fa63b31960869617cba39117e520eb415b0139543e325ab59"
dependencies = [
"cfg-if 0.1.10",
"rand 0.7.3",
"static_assertions",
]
[[package]]
name = "typenum"
version = "1.13.0"
@ -2301,10 +2284,11 @@ name = "uu_sort"
version = "0.0.6"
dependencies = [
"clap",
"fnv",
"itertools 0.8.2",
"rand 0.7.3",
"rayon",
"semver",
"twox-hash",
"uucore",
"uucore_procs",
]

View file

@ -15,9 +15,10 @@ edition = "2018"
path = "src/sort.rs"
[dependencies]
rayon = "1.5"
rand = "0.7"
clap = "2.33"
twox-hash = "1.6.0"
fnv = "1.0.7"
itertools = "0.8.0"
semver = "0.9.0"
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }

View file

@ -7,23 +7,29 @@
// * file that was distributed with this source code.
#![allow(dead_code)]
// Although these links don't always seem to describe reality, check out the POSIX and GNU specs:
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
// spell-checker:ignore (ToDO) outfile nondictionary
#[macro_use]
extern crate uucore;
use clap::{App, Arg};
use fnv::FnvHasher;
use itertools::Itertools;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use rayon::prelude::*;
use semver::Version;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::env;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
use std::mem::replace;
use std::path::Path;
use twox_hash::XxHash64;
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
static NAME: &str = "sort";
@ -33,27 +39,37 @@ static VERSION: &str = env!("CARGO_PKG_VERSION");
static OPT_HUMAN_NUMERIC_SORT: &str = "human-numeric-sort";
static OPT_MONTH_SORT: &str = "month-sort";
static OPT_NUMERIC_SORT: &str = "numeric-sort";
static OPT_GENERAL_NUMERIC_SORT: &str = "general-numeric-sort";
static OPT_VERSION_SORT: &str = "version-sort";
static OPT_DICTIONARY_ORDER: &str = "dictionary-order";
static OPT_MERGE: &str = "merge";
static OPT_CHECK: &str = "check";
static OPT_CHECK_SILENT: &str = "check-silent";
static OPT_IGNORE_CASE: &str = "ignore-case";
static OPT_IGNORE_BLANKS: &str = "ignore-blanks";
static OPT_IGNORE_NONPRINTING: &str = "ignore-nonprinting";
static OPT_OUTPUT: &str = "output";
static OPT_REVERSE: &str = "reverse";
static OPT_STABLE: &str = "stable";
static OPT_UNIQUE: &str = "unique";
static OPT_RANDOM: &str = "random-sort";
static OPT_ZERO_TERMINATED: &str = "zero-terminated";
static OPT_PARALLEL: &str = "parallel";
static OPT_FILES0_FROM: &str = "files0-from";
static ARG_FILES: &str = "files";
static DECIMAL_PT: char = '.';
static THOUSANDS_SEP: char = ',';
static NEGATIVE: char = '-';
static POSITIVE: char = '+';
#[derive(Eq, Ord, PartialEq, PartialOrd)]
enum SortMode {
Numeric,
HumanNumeric,
GeneralNumeric,
Month,
Version,
Default,
@ -67,10 +83,13 @@ struct Settings {
stable: bool,
unique: bool,
check: bool,
check_silent: bool,
random: bool,
compare_fns: Vec<fn(&str, &str) -> Ordering>,
compare_fn: fn(&str, &str) -> Ordering,
transform_fns: Vec<fn(&str) -> String>,
threads: String,
salt: String,
zero_terminated: bool,
}
impl Default for Settings {
@ -83,10 +102,13 @@ impl Default for Settings {
stable: false,
unique: false,
check: false,
check_silent: false,
random: false,
compare_fns: Vec::new(),
compare_fn: default_compare,
transform_fns: Vec::new(),
threads: String::new(),
salt: String::new(),
zero_terminated: false,
}
}
}
@ -206,6 +228,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.long(OPT_NUMERIC_SORT)
.help("compare according to string numerical value"),
)
.arg(
Arg::with_name(OPT_GENERAL_NUMERIC_SORT)
.short("g")
.long(OPT_GENERAL_NUMERIC_SORT)
.help("compare according to string general numerical value"),
)
.arg(
Arg::with_name(OPT_VERSION_SORT)
.short("V")
@ -230,12 +258,24 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.long(OPT_CHECK)
.help("check for sorted input; do not sort"),
)
.arg(
Arg::with_name(OPT_CHECK_SILENT)
.short("C")
.long(OPT_CHECK_SILENT)
.help("exit successfully if the given file is already sorted, and exit with status 1 otherwise. "),
)
.arg(
Arg::with_name(OPT_IGNORE_CASE)
.short("f")
.long(OPT_IGNORE_CASE)
.help("fold lower case to upper case characters"),
)
.arg(
Arg::with_name(OPT_IGNORE_NONPRINTING)
.short("-i")
.long(OPT_IGNORE_NONPRINTING)
.help("ignore nonprinting characters"),
)
.arg(
Arg::with_name(OPT_IGNORE_BLANKS)
.short("b")
@ -274,18 +314,65 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.long(OPT_UNIQUE)
.help("output only the first of an equal run"),
)
.arg(
Arg::with_name(OPT_ZERO_TERMINATED)
.short("z")
.long(OPT_ZERO_TERMINATED)
.help("line delimiter is NUL, not newline"),
)
.arg(
Arg::with_name(OPT_PARALLEL)
.long(OPT_PARALLEL)
.help("change the number of threads running concurrently to N")
.takes_value(true)
.value_name("NUM_THREADS"),
)
.arg(
Arg::with_name(OPT_FILES0_FROM)
.long(OPT_FILES0_FROM)
.help("read input from the files specified by NUL-terminated NUL_FILES")
.takes_value(true)
.value_name("NUL_FILES")
.multiple(true),
)
.arg(Arg::with_name(ARG_FILES).multiple(true).takes_value(true))
.get_matches_from(args);
let mut files: Vec<String> = matches
.values_of(ARG_FILES)
.map(|v| v.map(ToString::to_string).collect())
.unwrap_or_default();
// check whether user specified a zero terminated list of files for input, otherwise read files from args
let mut files: Vec<String> = if matches.is_present(OPT_FILES0_FROM) {
let files0_from: Vec<String> = matches
.values_of(OPT_FILES0_FROM)
.map(|v| v.map(ToString::to_string).collect())
.unwrap_or_default();
let mut files = Vec::new();
for path in &files0_from {
let (reader, _) = open(path.as_str()).expect("Could not read from file specified.");
let buf_reader = BufReader::new(reader);
for line in buf_reader.split(b'\0') {
if let Ok(n) = line {
files.push(
std::str::from_utf8(&n)
.expect("Could not parse zero terminated string from input.")
.to_string(),
);
}
}
}
files
} else {
matches
.values_of(ARG_FILES)
.map(|v| v.map(ToString::to_string).collect())
.unwrap_or_default()
};
settings.mode = if matches.is_present(OPT_HUMAN_NUMERIC_SORT) {
SortMode::HumanNumeric
} else if matches.is_present(OPT_MONTH_SORT) {
SortMode::Month
} else if matches.is_present(OPT_GENERAL_NUMERIC_SORT) {
SortMode::GeneralNumeric
} else if matches.is_present(OPT_NUMERIC_SORT) {
SortMode::Numeric
} else if matches.is_present(OPT_VERSION_SORT) {
@ -294,12 +381,29 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
SortMode::Default
};
if matches.is_present(OPT_DICTIONARY_ORDER) {
settings.transform_fns.push(remove_nondictionary_chars);
if matches.is_present(OPT_PARALLEL) {
// "0" is default - threads = num of cores
settings.threads = matches
.value_of(OPT_PARALLEL)
.map(String::from)
.unwrap_or("0".to_string());
env::set_var("RAYON_NUM_THREADS", &settings.threads);
}
if matches.is_present(OPT_DICTIONARY_ORDER) {
settings.transform_fns.push(remove_nondictionary_chars);
} else if matches.is_present(OPT_IGNORE_NONPRINTING) {
settings.transform_fns.push(remove_nonprinting_chars);
}
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
settings.merge = matches.is_present(OPT_MERGE);
settings.check = matches.is_present(OPT_CHECK);
if matches.is_present(OPT_CHECK_SILENT) {
settings.check_silent = matches.is_present(OPT_CHECK_SILENT);
settings.check = true;
};
if matches.is_present(OPT_IGNORE_CASE) {
settings.transform_fns.push(|s| s.to_uppercase());
@ -327,20 +431,14 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
crash!(1, "sort: extra operand `{}' not allowed with -c", files[1])
}
settings.compare_fns.push(match settings.mode {
settings.compare_fn = match settings.mode {
SortMode::Numeric => numeric_compare,
SortMode::GeneralNumeric => general_numeric_compare,
SortMode::HumanNumeric => human_numeric_size_compare,
SortMode::Month => month_compare,
SortMode::Version => version_compare,
SortMode::Default => default_compare,
});
if !settings.stable {
match settings.mode {
SortMode::Default => {}
_ => settings.compare_fns.push(default_compare),
}
}
};
exec(files, &mut settings)
}
@ -359,67 +457,79 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
if settings.merge {
file_merger.push_file(buf_reader.lines());
} else if settings.check {
return exec_check_file(buf_reader.lines(), &settings);
} else if settings.zero_terminated {
for line in buf_reader.split(b'\0') {
if let Ok(n) = line {
lines.push(
std::str::from_utf8(&n)
.expect("Could not parse string from zero terminated input.")
.to_string(),
);
}
}
} else {
for line in buf_reader.lines() {
if let Ok(n) = line {
lines.push(n);
} else {
break;
}
}
}
}
sort_by(&mut lines, &settings);
if settings.check {
return exec_check_file(lines, &settings);
} else {
sort_by(&mut lines, &settings);
}
if settings.merge {
if settings.unique {
print_sorted(file_merger.dedup(), &settings.outfile)
print_sorted(file_merger.dedup(), &settings)
} else {
print_sorted(file_merger, &settings.outfile)
print_sorted(file_merger, &settings)
}
} else if settings.unique && settings.mode == SortMode::Numeric {
} else if settings.mode == SortMode::Month && settings.unique {
print_sorted(
lines
.iter()
.dedup_by(|a, b| num_sort_dedup(a) == num_sort_dedup(b)),
&settings.outfile,
.dedup_by(|a, b| get_months_dedup(a) == get_months_dedup(b)),
&settings,
)
} else if settings.unique {
print_sorted(lines.iter().dedup(), &settings.outfile)
print_sorted(
lines
.iter()
.dedup_by(|a, b| get_nums_dedup(a) == get_nums_dedup(b)),
&settings,
)
} else {
print_sorted(lines.iter(), &settings.outfile)
print_sorted(lines.iter(), &settings)
}
0
}
fn exec_check_file(lines: Lines<BufReader<Box<dyn Read>>>, settings: &Settings) -> i32 {
fn exec_check_file(unwrapped_lines: Vec<String>, settings: &Settings) -> i32 {
// errors yields the line before each disorder,
// plus the last line (quirk of .coalesce())
let unwrapped_lines = lines.filter_map(|maybe_line| {
if let Ok(line) = maybe_line {
Some(line)
} else {
None
}
});
let mut errors = unwrapped_lines
.enumerate()
.coalesce(|(last_i, last_line), (i, line)| {
if compare_by(&last_line, &line, &settings) == Ordering::Greater {
Err(((last_i, last_line), (i, line)))
} else {
Ok((i, line))
}
});
let mut errors =
unwrapped_lines
.iter()
.enumerate()
.coalesce(|(last_i, last_line), (i, line)| {
if compare_by(&last_line, &line, &settings) == Ordering::Greater {
Err(((last_i, last_line), (i, line)))
} else {
Ok((i, line))
}
});
if let Some((first_error_index, _line)) = errors.next() {
// Check for a second "error", as .coalesce() always returns the last
// line, no matter what our merging function does.
if let Some(_last_line_or_next_error) = errors.next() {
println!("sort: disorder in line {}", first_error_index);
if !settings.check_silent {
println!("sort: disorder in line {}", first_error_index);
};
1
} else {
// first "error" was actually the last line.
@ -431,8 +541,9 @@ fn exec_check_file(lines: Lines<BufReader<Box<dyn Read>>>, settings: &Settings)
}
}
#[inline(always)]
fn transform(line: &str, settings: &Settings) -> String {
let mut transformed = line.to_string();
let mut transformed = line.to_owned();
for transform_fn in &settings.transform_fns {
transformed = transform_fn(&transformed);
}
@ -440,8 +551,9 @@ fn transform(line: &str, settings: &Settings) -> String {
transformed
}
#[inline(always)]
fn sort_by(lines: &mut Vec<String>, settings: &Settings) {
lines.sort_by(|a, b| compare_by(a, b, &settings))
lines.par_sort_by(|a, b| compare_by(a, b, &settings))
}
fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
@ -454,72 +566,198 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
(a, b)
};
for compare_fn in &settings.compare_fns {
let cmp: Ordering = if settings.random {
random_shuffle(a, b, settings.salt.clone())
// 1st Compare
let mut cmp: Ordering = if settings.random {
random_shuffle(a, b, settings.salt.clone())
} else {
(settings.compare_fn)(a, b)
};
// Call "last resort compare" on any equal
if cmp == Ordering::Equal {
if settings.random || settings.stable || settings.unique {
cmp = Ordering::Equal
} else {
compare_fn(a, b)
cmp = default_compare(a, b)
};
if cmp != Ordering::Equal {
if settings.reverse {
return cmp.reverse();
} else {
return cmp;
}
}
};
if settings.reverse {
return cmp.reverse();
} else {
return cmp;
}
Ordering::Equal
}
// Test output against BSDs and GNU with their locale
// env var set to lc_ctype=utf-8 to enjoy the exact same output.
/// Plain string ordering of two lines, as defined by `str`'s `Ord` impl.
///
/// This is the default sort mode and also the "last resort" tie-breaker used
/// when a specialised comparison reports two lines as equal.
#[inline(always)]
fn default_compare(a: &str, b: &str) -> Ordering {
    Ord::cmp(a, b)
}
fn get_leading_number(a: &str) -> &str {
// This function does the initial detection of numeric lines.
// Lines starting with a number or positive or negative sign.
// It also strips the string of any thing that could never
// be a number for the purposes of any type of numeric comparison.
#[inline(always)]
fn leading_num_common(a: &str) -> &str {
let mut s = "";
for c in a.chars() {
if !c.is_numeric() && !c.eq(&'-') && !c.eq(&' ') && !c.eq(&'.') && !c.eq(&',') {
s = a.trim().split(c).next().unwrap();
for (idx, c) in a.char_indices() {
// check whether char is numeric, whitespace, a decimal point or a thousands separator
if !c.is_numeric()
&& !c.is_whitespace()
&& !c.eq(&DECIMAL_PT)
&& !c.eq(&THOUSANDS_SEP)
// check for e notation
&& !c.eq(&'e')
&& !c.eq(&'E')
// check whether first char is + or -
&& !a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE)
&& !a.chars().nth(0).unwrap_or('\0').eq(&NEGATIVE)
{
// Strip string of non-numeric trailing chars
s = &a[..idx];
break;
}
s = a.trim();
// If line is not a number line, return the line as is
s = a;
}
return s;
s
}
// Matches GNU behavior, see:
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
// Specifically *not* the same as sort -n | uniq
fn num_sort_dedup(a: &str) -> &str {
// Empty lines are dumped
if a.is_empty() {
return "0";
// And lines that don't begin numerically are dumped
} else if !a.trim().chars().nth(0).unwrap_or('\0').is_numeric() {
return "0";
} else {
// Prepare lines for comparison of only the numerical leading numbers
return get_leading_number(a);
// This function cleans up the initial comparison done by leading_num_common for a numeric compare.
// GNU sort does its numeric comparison through strnumcmp. However, we don't have or
// may not want to use libc. Instead we emulate the GNU sort numeric compare by ignoring
// those leading number lines GNU sort would not recognize. GNU numeric compare would
// not recognize a positive sign or scientific/E notation so we strip those elements here.
fn get_leading_num(a: &str) -> &str {
    let candidate = leading_num_common(a);

    // Plain numeric sort recognises neither a leading '+' nor e-notation:
    // a '+'-prefixed line contributes nothing, and anything from the first
    // 'e'/'E' onwards is cut off.
    let number = if candidate.starts_with(POSITIVE) {
        ""
    } else {
        match candidate.find(|c: char| c == 'e' || c == 'E') {
            Some(exponent_at) => &candidate[..exponent_at],
            None => candidate,
        }
    };

    // Empty and non-number lines count as 0 for numeric sort only; they are
    // ordered amongst themselves later by the last-resort comparison.
    if number.is_empty() {
        "0"
    } else {
        number
    }
}
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
// scientific notation, so we strip those lines only after the end of the following numeric string.
// For example, 5e10KFD would be 5e10 or 5x10^10 and +10000HFKJFK would become 10000.
/// Prepares a line for general numeric (`-g`) comparison, returning an owned
/// string that the caller then tries to parse as a float.
///
/// The line is first reduced by `leading_num_common`; the characters of that
/// reduced text drive the loop below, but every branch builds `r` from the
/// original `a`. Returns an empty string when the reduced text has no
/// characters at all.
///
/// NOTE(review): `p_iter.to_owned()` clones the peekable iterator, so the
/// `for` loop advances the clone while `p_iter` itself is never advanced.
/// `p_iter.peek()` therefore yields the FIRST character of the reduced text on
/// every iteration, not the character following `c` — confirm whether that is
/// intended before relying on the "next char" wording below.
///
/// NOTE(review): because `r` is rebuilt from `a` rather than from the reduced
/// text, trailing characters removed by `leading_num_common` are present again
/// in the returned string — confirm against the expected `-g` behaviour.
fn get_leading_gen(a: &str) -> String {
// Make this iter peekable to see if next char is numeric
let mut p_iter = leading_num_common(a).chars().peekable();
let mut r = String::new();
// Cleanup raw stripped strings
for c in p_iter.to_owned() {
// See NOTE(review) above: `p_iter` is not the iterator being looped over.
let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric();
// Only general numeric recognizes e notation and, see block below, the '+' sign
if (c.eq(&'e') && !next_char_numeric)
|| (c.eq(&'E') && !next_char_numeric)
{
// Keep only the part of the original line before the first occurrence of `c`
r = a.split(c).next().unwrap_or("").to_owned();
break;
// If positive sign and next char is not numeric, split at the positive sign and keep the trailing numbers
// There is a more elegant way to do this in Rust 1.45, std::str::strip_prefix
} else if c.eq(&POSITIVE) && !next_char_numeric {
// Drop everything before the first '+' and re-join the remaining pieces,
// which also removes any later '+' characters from the line
let mut v: Vec<&str> = a.split(c).collect();
let x = v.split_off(1);
r = x.join("");
break;
// If no further processing needed to be done, return the line as-is to be sorted
} else {
r = a.to_owned();
}
}
r
}
/// Builds the key used to collapse duplicate lines for `--month-sort --unique`.
///
/// Returns the upper-cased first three bytes of `a` when they spell a month
/// abbreviation (`JAN` through `DEC`), so "jan", "JAN 1" and "January" all
/// share the key `"JAN"`. Every other line yields the empty string, which
/// makes all non-month lines compare equal to one another: lines shorter than
/// three bytes once trimmed, lines that do not start with a month, and lines
/// whose third byte falls inside a multi-byte character.
fn get_months_dedup(a: &str) -> String {
    const MONTHS: [&str; 12] = [
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    ];

    // The length test looks at the trimmed line while the prefix is taken from
    // the untrimmed one, so a line with leading blanks is never a month.
    if a.trim().len() < 3 {
        return String::new();
    }

    // `get` rather than `split_at`: slicing at byte 3 panics when that offset
    // is not a char boundary (e.g. a line made of two-byte characters).
    let key = match a.get(..3) {
        Some(prefix) => prefix.to_uppercase(),
        None => return String::new(),
    };

    if MONTHS.contains(&key.as_str()) {
        key
    } else {
        String::new()
    }
}
// *For all dedups/uniques we must compare leading numbers*
// Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq"
// See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
/// Builds the key used to collapse duplicate lines for `--unique`.
///
/// The line is trimmed and stripped of leading zeros. If what remains starts
/// with a minus sign or a numeric character, the key is its leading number as
/// computed by `get_leading_num`; otherwise (empty or non-number line) the key
/// is the empty string, so all such lines are treated as the same.
fn get_nums_dedup(a: &str) -> &str {
    let stripped = a.trim().trim_start_matches('0');
    match stripped.chars().next() {
        // Only lines that begin like a number are compared by their number
        Some(first) if first == NEGATIVE || first.is_numeric() => get_leading_num(stripped),
        // Nothing left, or the line does not begin numerically
        _ => "",
    }
}
/// Parse the beginning string into an f64, returning -inf instead of NaN on errors.
#[inline(always)]
fn permissive_f64_parse(a: &str) -> f64 {
// Remove thousands separators
let a = a.replace(THOUSANDS_SEP, "");
// GNU sort treats "NaN" as non-number in numeric, so it needs special care.
match a.parse::<f64>() {
// *Keep this trim before parse* despite what POSIX may say about -b and -n
// because GNU and BSD both seem to require it to match their behavior
match a.trim().parse::<f64>() {
Ok(a) if a.is_nan() => std::f64::NEG_INFINITY,
Ok(a) => a,
Err(_) => std::f64::NEG_INFINITY,
}
}
/// Compares two floats, with errors and non-numerics assumed to be -inf.
/// Stops coercing at the first non-numeric char.
fn numeric_compare(a: &str, b: &str) -> Ordering {
#![allow(clippy::comparison_chain)]
let sa = get_leading_number(a);
let sb = get_leading_number(b);
let sa = get_leading_num(a);
let sb = get_leading_num(b);
let fa = permissive_f64_parse(sa);
let fb = permissive_f64_parse(sb);
@ -534,27 +772,17 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
}
}
fn human_numeric_convert(a: &str) -> f64 {
let int_str = get_leading_number(a);
let (_, s) = a.split_at(int_str.len());
let int_part = permissive_f64_parse(int_str);
let suffix: f64 = match s.parse().unwrap_or('\0') {
'K' => 1000f64,
'M' => 1E6,
'G' => 1E9,
'T' => 1E12,
'P' => 1E15,
_ => 1f64,
};
int_part * suffix
}
/// Compare two strings as if they are human readable sizes.
/// AKA 1M > 100k
fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
/// Compares two floats, with errors and non-numerics assumed to be -inf.
/// Stops coercing at the first non-numeric char.
fn general_numeric_compare(a: &str, b: &str) -> Ordering {
#![allow(clippy::comparison_chain)]
let fa = human_numeric_convert(a);
let fb = human_numeric_convert(b);
let sa = get_leading_gen(a);
let sb = get_leading_gen(b);
let fa = permissive_f64_parse(&sa);
let fb = permissive_f64_parse(&sb);
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb {
Ordering::Greater
@ -565,14 +793,46 @@ fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
}
}
fn random_shuffle(a: &str, b: &str, salt: String) -> Ordering {
// GNU/BSD does not handle converting numbers to an equal scale
// properly. GNU/BSD simply recognize that there is a human scale and sorts
// those numbers ahead of other number inputs. There are perhaps limits
// to the type of behavior we should emulate, and this might be such a limit.
// Properly handling these units seems like a value add to me. And when sorting
// these types of numbers, we rarely care about pure performance.
/// Converts a human-readable size such as `8981K` or `0.8M` into a plain `f64`.
///
/// The leading number comes from `get_leading_num` and is parsed with
/// `permissive_f64_parse`. Whatever follows it is treated as a unit only when
/// it is exactly one character; any other suffix leaves the number unscaled.
fn human_numeric_convert(a: &str) -> f64 {
    let digits = get_leading_num(a);
    let unit = a.trim_start_matches(digits);
    let magnitude = permissive_f64_parse(digits);
    // `parse::<char>` succeeds only for a single-character suffix
    let multiplier: f64 = match unit.parse::<char>() {
        // SI Units
        Ok('K') => 1E3,
        Ok('M') => 1E6,
        Ok('G') => 1E9,
        Ok('T') => 1E12,
        Ok('P') => 1E15,
        Ok('E') => 1E18,
        Ok('Z') => 1E21,
        Ok('Y') => 1E24,
        _ => 1f64,
    };
    magnitude * multiplier
}
/// Compare two strings as if they are human readable sizes.
/// AKA 1M > 100k
fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
#![allow(clippy::comparison_chain)]
let salt_slice = salt.as_str();
let fa = human_numeric_convert(a);
let fb = human_numeric_convert(b);
let da = hash(&[a, salt_slice].concat());
let db = hash(&[b, salt_slice].concat());
da.cmp(&db)
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb {
Ordering::Greater
} else if fa < fb {
Ordering::Less
} else {
Ordering::Equal
}
}
fn get_rand_string() -> String {
@ -583,12 +843,22 @@ fn get_rand_string() -> String {
.collect::<String>()
}
fn hash<T: Hash>(t: &T) -> u64 {
let mut s: XxHash64 = Default::default();
fn get_hash<T: Hash>(t: &T) -> u64 {
let mut s: FnvHasher = Default::default();
t.hash(&mut s);
s.finish()
}
fn random_shuffle(a: &str, b: &str, x: String) -> Ordering {
#![allow(clippy::comparison_chain)]
let salt_slice = x.as_str();
let da = get_hash(&[a, salt_slice].concat());
let db = get_hash(&[b, salt_slice].concat());
da.cmp(&db)
}
#[derive(Eq, Ord, PartialEq, PartialOrd)]
enum Month {
Unknown,
@ -608,13 +878,15 @@ enum Month {
/// Parse the beginning string into a Month, returning Month::Unknown on errors.
fn month_parse(line: &str) -> Month {
match line
.split_whitespace()
.next()
.unwrap()
.to_uppercase()
.as_ref()
{
// GNU splits at any 3 letter match "JUNNNN" is JUN
let pattern = if line.trim().len().ge(&3) {
// Split at byte 3 and get the first element of the tuple (".0")
line.split_at(3).0
} else {
""
};
match pattern.to_uppercase().as_ref() {
"JAN" => Month::January,
"FEB" => Month::February,
"MAR" => Month::March,
@ -632,7 +904,16 @@ fn month_parse(line: &str) -> Month {
}
fn month_compare(a: &str, b: &str) -> Ordering {
month_parse(a).cmp(&month_parse(b))
let ma = month_parse(a);
let mb = month_parse(b);
if ma > mb {
Ordering::Greater
} else if ma < mb {
Ordering::Less
} else {
Ordering::Equal
}
}
fn version_compare(a: &str, b: &str) -> Ordering {
@ -650,19 +931,26 @@ fn version_compare(a: &str, b: &str) -> Ordering {
}
fn remove_nondictionary_chars(s: &str) -> String {
// Using 'is_ascii_whitespace()' instead of 'is_whitespace()', because it
// uses only symbols compatible with UNIX sort (space, tab, newline).
// 'is_whitespace()' uses more symbols as whitespace (e.g. vertical tab).
// According to GNU, dictionary chars are those of ASCII
// and a blank is a space or a tab
s.chars()
.filter(|c| c.is_alphanumeric() || c.is_ascii_whitespace())
.filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace())
.collect::<String>()
}
fn print_sorted<S, T: Iterator<Item = S>>(iter: T, outfile: &Option<String>)
/// Returns a copy of `s` containing only ASCII characters that are not control
/// characters; control characters (escape, newline, tab, ...) and all
/// non-ASCII characters are dropped.
fn remove_nonprinting_chars(s: &str) -> String {
    let mut kept = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii() && !ch.is_ascii_control() {
            kept.push(ch);
        }
    }
    kept
}
fn print_sorted<S, T: Iterator<Item = S>>(iter: T, settings: &Settings)
where
S: std::fmt::Display,
{
let mut file: Box<dyn Write> = match *outfile {
let mut file: Box<dyn Write> = match settings.outfile {
Some(ref filename) => match File::create(Path::new(&filename)) {
Ok(f) => Box::new(BufWriter::new(f)) as Box<dyn Write>,
Err(e) => {
@ -673,9 +961,16 @@ where
None => Box::new(stdout()) as Box<dyn Write>,
};
for line in iter {
let str = format!("{}\n", line);
crash_if_err!(1, file.write_all(str.as_bytes()))
if settings.zero_terminated {
for line in iter {
let str = format!("{}\0", line);
crash_if_err!(1, file.write_all(str.as_bytes()));
}
} else {
for line in iter {
let str = format!("{}\n", line);
crash_if_err!(1, file.write_all(str.as_bytes()));
}
}
}
@ -700,6 +995,22 @@ mod tests {
use super::*;
#[test]
fn test_get_hash() {
    // Pins the digest of a known input so a change of hasher is noticed.
    let input = String::from("Ted");
    assert_eq!(2646829031758483623, get_hash(&input));
}
#[test]
fn test_random_shuffle() {
    // Identical lines must tie regardless of which random salt is drawn.
    let line = "Ted";
    let salt = get_rand_string();
    assert_eq!(Ordering::Equal, random_shuffle(line, line, salt));
}
#[test]
fn test_default_compare() {
let a = "your own";
@ -746,13 +1057,4 @@ mod tests {
assert_eq!(Ordering::Less, version_compare(a, b));
}
#[test]
fn test_random_compare() {
let a = "9";
let b = "9";
let c = get_rand_string();
assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
}
}

View file

@ -1,44 +1,82 @@
use crate::common::util::*;
// Checking (-c) an unsorted NUL-delimited fixture (-z) must fail and report
// the first disorder on stdout.
#[test]
fn test_check_zero_terminated_failure() {
new_ucmd!()
.arg("-z")
.arg("-c")
.arg("zero-terminated.txt")
.fails()
.stdout_is("sort: disorder in line 0\n");
}
// Checking (-c) the already-sorted NUL-delimited fixture (-z) must succeed.
#[test]
fn test_check_zero_terminated_success() {
new_ucmd!()
.arg("-z")
.arg("-c")
.arg("zero-terminated.expected")
.succeeds();
}
#[test]
fn test_random_shuffle_len() {
    // A shuffle must reorder the fixture without changing its total length.
    const FILE: &str = "default_unsorted_ints.expected";
    let (fixtures, _ucmd) = at_and_ucmd!();
    let shuffled = new_ucmd!().arg("-R").arg(FILE).run().stdout;
    let original = fixtures.read(FILE);

    assert_ne!(shuffled, original);
    assert_eq!(shuffled.len(), original.len());
}
#[test]
fn test_random_shuffle_contains_all_lines() {
    // Sorting the shuffled output again must give back the sorted fixture,
    // i.e. the shuffle neither drops nor invents lines.
    const FILE: &str = "default_unsorted_ints.expected";
    let (fixtures, _ucmd) = at_and_ucmd!();
    let shuffled = new_ucmd!().arg("-R").arg(FILE).run().stdout;
    let original = fixtures.read(FILE);
    let resorted = new_ucmd!().pipe_in(shuffled.clone()).run().stdout;

    assert_ne!(shuffled, original);
    assert_eq!(resorted, original);
}
#[test]
fn test_random_shuffle_contains_two_runs_not_the_same() {
    // Two independent shuffles should differ from the input and from each
    // other. This can fail in the unlikely event that a random order equals
    // the starting order, or that both runs happen to produce the same order.
    const FILE: &str = "default_unsorted_ints.expected";
    let (fixtures, _ucmd) = at_and_ucmd!();
    let first_run = new_ucmd!().arg("-R").arg(FILE).run().stdout;
    let original = fixtures.read(FILE);
    let second_run = new_ucmd!().arg("-R").arg(FILE).run().stdout;

    assert_ne!(first_run, original);
    assert_ne!(first_run, second_run);
}
#[test]
fn test_numeric_floats_and_ints() {
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1\n-8\n1.04\n-1";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n");
}
test_helper("numeric_floats_and_ints", "-n");
}
#[test]
fn test_numeric_floats() {
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n");
}
test_helper("numeric_floats", "-n");
}
#[test]
fn test_numeric_floats_with_nan() {
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n1.0/0.0\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8.90880\n-.05\n1.0/0.0\n1.040000000\n1.444\n1.58590\n");
}
test_helper("numeric_floats_with_nan", "-n");
}
#[test]
fn test_numeric_unfixed_floats() {
test_helper("numeric_fixed_floats", "-n");
test_helper("numeric_unfixed_floats", "-n");
}
#[test]
@ -53,26 +91,12 @@ fn test_numeric_unsorted_ints() {
#[test]
fn test_human_block_sizes() {
for human_numeric_sort_param in vec!["-h", "--human-numeric-sort"] {
let input = "8981K\n909991M\n-8T\n21G\n0.8M";
new_ucmd!()
.arg(human_numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8T\n0.8M\n8981K\n21G\n909991M\n");
}
test_helper("human_block_sizes", "-h");
}
#[test]
fn test_month_default() {
for month_sort_param in vec!["-M", "--month-sort"] {
let input = "JAn\nMAY\n000may\nJun\nFeb";
new_ucmd!()
.arg(month_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("000may\nJAn\nFeb\nMAY\nJun\n");
}
test_helper("month_default", "-M");
}
#[test]
@ -82,23 +106,12 @@ fn test_month_stable() {
#[test]
fn test_default_unsorted_ints() {
let input = "9\n1909888\n000\n1\n2";
new_ucmd!()
.pipe_in(input)
.succeeds()
.stdout_only("000\n1\n1909888\n2\n9\n");
test_helper("default_unsorted_ints", "");
}
#[test]
fn test_numeric_unique_ints() {
for numeric_unique_sort_param in vec!["-nu"] {
let input = "9\n9\n8\n1\n";
new_ucmd!()
.arg(numeric_unique_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("1\n8\n9\n");
}
test_helper("numeric_unsorted_ints_unique", "-nu");
}
#[test]
@ -116,6 +129,148 @@ fn test_dictionary_order() {
test_helper("dictionary_order", "-d");
}
#[test]
fn test_dictionary_order2() {
    // With -d the line containing the emoji must sort ahead of "aaaa b".
    new_ucmd!()
        .pipe_in("a👦🏻aa b\naaaa b")
        .arg("-d")
        .succeeds()
        .stdout_only("a👦🏻aa b\naaaa b\n");
}
#[test]
fn test_non_printing_chars() {
    // With -i "aaaa b" must be printed before the line containing the emoji.
    new_ucmd!()
        .pipe_in("a👦🏻aa b\naaaa b")
        .arg("-i")
        .succeeds()
        .stdout_only("aaaa b\na👦🏻aa b\n");
}
#[test]
fn test_exponents_positive_general_fixed() {
    // Pins the current -g ordering for blank lines, '+'-prefixed numbers,
    // e-notation and numbers followed by trailing text.
    new_ucmd!()
        .pipe_in("100E6\n\n50e10\n+100000\n\n10000K78\n10E\n\n\n1000EDKLD\n\n\n100E6\n\n50e10\n+100000\n\n")
        .arg("-g")
        .succeeds()
        .stdout_only("\n\n\n\n\n\n\n\n10000K78\n1000EDKLD\n10E\n+100000\n+100000\n100E6\n100E6\n50e10\n50e10\n");
}
#[test]
fn test_exponents_positive_numeric() {
test_helper("exponents-positive-numeric", "-n");
}
#[test]
fn test_months_dedup() {
test_helper("months-dedup", "-Mu");
}
#[test]
fn test_mixed_floats_ints_chars_numeric() {
test_helper("mixed_floats_ints_chars_numeric", "-n");
}
#[test]
fn test_mixed_floats_ints_chars_numeric_unique() {
test_helper("mixed_floats_ints_chars_numeric_unique", "-nu");
}
#[test]
fn test_mixed_floats_ints_chars_numeric_reverse() {
test_helper("mixed_floats_ints_chars_numeric_unique_reverse", "-nur");
}
#[test]
fn test_mixed_floats_ints_chars_numeric_stable() {
test_helper("mixed_floats_ints_chars_numeric_stable", "-ns");
}
#[test]
fn test_numeric_floats_and_ints2() {
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1\n-8\n1.04\n-1";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n");
}
}
/// Checks numeric sorting of floats (negative values, trailing zeros and a
/// leading-dot value) for both `-n` and `--numeric-sort`.
#[test]
fn test_numeric_floats2() {
    let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05";
    let expected = "-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n";
    for &flag in &["-n", "--numeric-sort"] {
        new_ucmd!()
            .arg(flag)
            .pipe_in(input)
            .succeeds()
            .stdout_only(expected);
    }
}
/// Runs the shared `test_helper` on the `numeric-floats-with-nan2` fixture with `-n`.
#[test]
fn test_numeric_floats_with_nan2() {
    let (fixture, flags) = ("numeric-floats-with-nan2", "-n");
    test_helper(fixture, flags);
}
/// Checks human-numeric sorting of suffixed sizes (K/M/G/T, one negative)
/// for both `-h` and `--human-numeric-sort`.
#[test]
fn test_human_block_sizes2() {
    let input = "8981K\n909991M\n-8T\n21G\n0.8M";
    let expected = "-8T\n0.8M\n8981K\n21G\n909991M\n";
    for &flag in &["-h", "--human-numeric-sort"] {
        new_ucmd!()
            .arg(flag)
            .pipe_in(input)
            .succeeds()
            .stdout_only(expected);
    }
}
/// Checks month sorting of mixed-case month names plus one entry that is not
/// a bare month name (`000may`), for both `-M` and `--month-sort`.
///
/// The expected output places `000may` ahead of every month name.
#[test]
fn test_month_default2() {
    let input = "JAn\nMAY\n000may\nJun\nFeb";
    let expected = "000may\nJAn\nFeb\nMAY\nJun\n";
    for &flag in &["-M", "--month-sort"] {
        new_ucmd!()
            .arg(flag)
            .pipe_in(input)
            .succeeds()
            .stdout_only(expected);
    }
}
/// Sorts integer-looking lines with no flags; the expected order is
/// character-wise (`1909888` before `2`), not numeric.
#[test]
fn test_default_unsorted_ints2() {
    let input = "9\n1909888\n000\n1\n2";
    let expected = "000\n1\n1909888\n2\n9\n";
    new_ucmd!()
        .pipe_in(input)
        .succeeds()
        .stdout_only(expected);
}
/// Checks that `-nu` sorts numerically and collapses the duplicated `9`.
#[test]
fn test_numeric_unique_ints2() {
    let input = "9\n9\n8\n1\n";
    let expected = "1\n8\n9\n";
    for &flag in &["-nu"] {
        new_ucmd!()
            .arg(flag)
            .pipe_in(input)
            .succeeds()
            .stdout_only(expected);
    }
}
/// Runs the shared `test_helper` on the `zero-terminated` fixture with `-z`.
#[test]
fn test_zero_terminated() {
    let (fixture, flags) = ("zero-terminated", "-z");
    test_helper(fixture, flags);
}
#[test]
fn test_multiple_files() {
new_ucmd!()
@ -192,6 +347,15 @@ fn test_check() {
.stdout_is("");
}
/// Runs with `-C` against `check_fail.txt` and expects a failing exit status
/// with nothing written to stdout.
#[test]
fn test_check_silent() {
    let silent_check_flag = "-C";
    let fixture = "check_fail.txt";
    new_ucmd!()
        .arg(silent_check_flag)
        .arg(fixture)
        .fails()
        .stdout_is("");
}
fn test_helper(file_name: &str, args: &str) {
new_ucmd!()
.arg(args)

View file

@ -0,0 +1,12 @@
10E
1000EDKLD
10000K78
+100000
100E6
50e10

View file

@ -0,0 +1,12 @@
10000K78
10E
1000EDKLD
100E6
50e10
+100000

View file

@ -0,0 +1,12 @@
+100000
10E
50e10
100E6
1000EDKLD
10000K78

View file

@ -0,0 +1,12 @@
10000K78
10E
1000EDKLD
100E6
50e10
+100000

View file

@ -0,0 +1,37 @@
.2T
2G
100M
7800900K
51887300-
1890777
56908-90078
6780.0009866
6780.000986
789----009999 90-0 90-0
1
0001
apr
MAY
JUNNNN
JAN
AUG
APR
0000000
00
-1.4

View file

@ -0,0 +1,37 @@
JAN
0000000
00
0001
1
-1.4
JUNNNN
AUG
apr
APR
MAY
1890777
56908-90078
51887300-
6780.0009866
789----009999 90-0 90-0
6780.000986
100M
7800900K
2G
.2T

View file

@ -0,0 +1,37 @@
-1.4
JAN
0000000
00
JUNNNN
AUG
apr
APR
MAY
0001
1
789----009999 90-0 90-0
6780.000986
6780.0009866
56908-90078
1890777
51887300-
7800900K
100M
2G
.2T

View file

@ -0,0 +1,37 @@
JAN
0000000
00
0001
1
-1.4
JUNNNN
AUG
apr
APR
MAY
1890777
56908-90078
51887300-
6780.0009866
789----009999 90-0 90-0
6780.000986
100M
7800900K
2G
.2T

View file

@ -0,0 +1,13 @@
-1.4
JAN
0001
789----009999 90-0 90-0
6780.000986
6780.0009866
56908-90078
1890777
51887300-
7800900K
100M
2G
.2T

View file

@ -0,0 +1,37 @@
JAN
0000000
00
0001
1
-1.4
JUNNNN
AUG
apr
APR
MAY
1890777
56908-90078
51887300-
6780.0009866
789----009999 90-0 90-0
6780.000986
100M
7800900K
2G
.2T

View file

@ -0,0 +1,37 @@
-1.4
00
0000000
APR
AUG
JAN
JUNNNN
MAY
apr
0001
1
789----009999 90-0 90-0
6780.000986
6780.0009866
56908-90078
1890777
51887300-
7800900K
100M
2G
.2T

View file

@ -0,0 +1,46 @@
JAN
0000000
00
0001
1
-1.4
JUNNNN
AUG
apr
APR
MAY
1890777
56908-90078
51887300-
6780.0009866
789----009999 90-0 90-0
6780.000986
1M
10M
100M
1000M
10000M
7800900K
780090K
78009K
7800K
780K
2G
.2T

View file

@ -0,0 +1,30 @@
-2028789030
-896689
-8.90880
-1
-.05
000
CARAvan
00000001
1
1.040000000
1.444
1.58590
8.013
45
46.89
4567.
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,30 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
00000001
CARAvan
000
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -0,0 +1,30 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
00000001
CARAvan
000
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,30 @@
-2028789030
-896689
-8.90880
-1
-.05
CARAvan
000
1
00000001
1.040000000
1.444
1.58590
8.013
45
46.89
4567.
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,20 @@
-2028789030
-896689
-8.90880
-1
-.05
1
1.040000000
1.444
1.58590
8.013
45
46.89
4567.
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,20 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,20 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -0,0 +1,30 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,6 @@
JAN
apr
MAY
JUNNNN
AUG

37
tests/fixtures/sort/months-dedup.txt vendored Normal file
View file

@ -0,0 +1,37 @@
JAN
0000000
00
0001
1
-1.4
JUNNNN
AUG
apr
APR
MAY
1890777
56908-90078
51887300-
6780.0009866
789----009999 90-0 90-0
6780.000986
100M
7800900K
2G
.2T

View file

@ -0,0 +1,23 @@
-8.90880
-.05
Karma
1
1.0/0.0
1.040000000
1.2
1.444
1.58590

View file

@ -0,0 +1,23 @@
Karma
1.0/0.0
-8.90880
-.05
1.040000000
1.444
1.58590
1
1.2

View file

@ -0,0 +1 @@
../..../../by-util../../common../../fixtures../../fixtures/cat../../fixtures/cksum../../fixtures/comm../../fixtures/cp../../fixtures/cp/dir_with_mount../../fixtures/cp/dir_with_mount/copy_me../../fixtures/cp/hello_dir../../fixtures/cp/hello_dir_with_file../../fixtures/csplit../../fixtures/cut../../fixtures/cut/sequences../../fixtures/dircolors../../fixtures/du../../fixtures/du/subdir../../fixtures/du/subdir/deeper../../fixtures/du/subdir/links../../fixtures/env../../fixtures/expand../../fixtures/fmt../../fixtures/fold../../fixtures/hashsum../../fixtures/head../../fixtures/join../../fixtures/mv../../fixtures/nl../../fixtures/numfmt../../fixtures/od../../fixtures/paste../../fixtures/ptx../../fixtures/shuf../../fixtures/sort../../fixtures/sum../../fixtures/tac../../fixtures/tail../../fixtures/tsort../../fixtures/unexpand../../fixtures/uniq../../fixtures/wc

View file

@ -0,0 +1 @@
../../fixtures/paste../../fixtures/du../../fixtures/fold../../fixtures../../fixtures/cp/dir_with_mount/copy_me../../fixtures/sum../../fixtures/expand../../fixtures/mv../../fixtures/shuf../../fixtures/od../../fixtures/env../../fixtures/cut../../fixtures/cp/hello_dir../../fixtures/hashsum../../common../../fixtures/du/subdir/links../../fixtures/dircolors../../fixtures/nl../../fixtures/wc../../fixtures/cut/sequences../../fixtures/numfmt../../fixtures/comm../../fixtures/du/subdir../../fixtures/cp/hello_dir_with_file../../fixtures/ptx../../fixtures/cp/dir_with_mount../../fixtures/cat../../fixtures/cp../..../../fixtures/tail../../fixtures/du/subdir/deeper../../fixtures/head../../fixtures/join../../by-util../../fixtures/csplit../../fixtures/cksum../../fixtures/fmt../../fixtures/tsort../../fixtures/tac../../fixtures/unexpand../../fixtures/uniq../../fixtures/sort