From 2c83b28d1876a22ca2f7051e8229c6929f8a3ab5 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 26 May 2024 01:07:13 +0200 Subject: [PATCH] hashsum: improve the file verification algo. We have 3 different kinds of input: * "algo (filename) = checksum" example: `BLAKE2 (a) = bedfbb90d858c2d67b7ee8f7523be3d3b54004ef9e4f02f2ad79a1d05bfdfe49b81e3c92ebf99b504102b6bf003fa342587f5b3124c205f55204e8c4b4ce7d7c` * "checksum filename" example: `60b725f10c9c85c70d97880dfe8191b3 a` * "checksum filename" example: `60b725f10c9c85c70d97880dfe8191b3 a` These algo/regexp are tricky as files can be called "a, " b", " ", or "*c". We look at the first time to analyze the kind of input and reuse the same regexp then. --- src/uucore/src/lib/features/checksum.rs | 196 ++++++++++++++++++++---- tests/by-util/test_hashsum.rs | 101 ++++++++++-- 2 files changed, 254 insertions(+), 43 deletions(-) diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index b13b39e49..88841e9a2 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -299,6 +299,17 @@ pub fn detect_algo(algo: &str, length: Option) -> UResult } } +// Regexp to handle the three input formats: +// 1. [-] () = +// algo must be uppercase or b (for blake2b) +// 2. [* ] +// 3. [*] (only one space) +const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P.*)\)\s*=\s*(?P[a-fA-F0-9]+)$"; +const DOUBLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s{2}(?P.*)$"; + +// In this case, we ignore the * +const SINGLE_SPACE_REGEX: &str = r"^(?P[a-fA-F0-9]+)\s(?P\*?)(?P.*)$"; + /*** * Do the checksum validation (can be strict or not) */ @@ -317,12 +328,9 @@ pub fn perform_checksum_validation<'a, I>( where I: Iterator, { - // Regexp to handle the two input formats: - // 1. [-] () = - // algo must be uppercase or b (for blake2b) - // 2. [* ] - let regex_pattern = r"^\s*\\?(?P(?:[A-Z0-9]+|BLAKE2b))(?:-(?P\d+))?\s?\((?P.*)\)\s*=\s*(?P[a-fA-F0-9]+)$|^(?P[a-fA-F0-9]+)\s[* ](?P.*)"; - let re = Regex::new(regex_pattern).unwrap(); + let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap(); + let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); + let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); // if cksum has several input files, it will print the result for each file for filename_input in files { @@ -350,30 +358,37 @@ where } } }; - let reader = BufReader::new(file); + let mut reader = BufReader::new(file); + + let mut first_line = String::new(); + reader.read_line(&mut first_line)?; + + // Determine which regular expression to use based on the first line + let first_line_trim = first_line.trim(); + let (chosen_regex, algo_based_format) = if algo_based_regex.is_match(first_line_trim) { + (&algo_based_regex, true) + } else if double_space_regex.is_match(first_line_trim) { + // If the first line contains a double space, use the double space regex + (&double_space_regex, false) + } else { + // It is probably rare but sometimes the checksum file may contain a single space + (&single_space_regex, false) + }; + + // Push the first line back to the reader + let first_line_reader = io::Cursor::new(first_line); + let chain_reader = first_line_reader.chain(reader); + let reader = BufReader::new(chain_reader); // for each line in the input, check if it is a valid checksum line for (i, line) in reader.lines().enumerate() { let line = line.unwrap_or_else(|_| String::new()); - if let Some(caps) = re.captures(&line) { + if let Some(caps) = chosen_regex.captures(&line) { properly_formatted = true; - // Determine what kind of file input we had - // we need it for case "--check -a sm3 " when is - // [-] () = - let algo_based_format = - caps.name("filename1").is_some() && caps.name("checksum1").is_some(); + let filename_to_check = caps.name("filename").unwrap().as_str(); - let filename_to_check = caps - .name("filename1") - .or(caps.name("filename2")) - .unwrap() - .as_str(); - let expected_checksum = caps - .name("checksum1") - .or(caps.name("checksum2")) - .unwrap() - .as_str(); + let expected_checksum = caps.name("checksum").unwrap().as_str(); // If the algo_name is provided, we use it, otherwise we try to detect it let (algo_name, length) = if algo_based_format { @@ -432,7 +447,7 @@ where } let mut algo = detect_algo(&algo_name, length)?; - let (filename_to_check_unescaped, prefix) = unescape_filename(filename_to_check); + let (filename_to_check_unescaped, prefix) = unescape_filename(&filename_to_check); // manage the input file let file_to_check: Box = if filename_to_check == "-" { @@ -472,7 +487,7 @@ where // Do the checksum validation if expected_checksum == calculated_checksum { - if !quiet { + if !quiet && !status { println!("{prefix}{filename_to_check}: OK"); } correct_format += 1; @@ -510,15 +525,17 @@ where // return an error if !properly_formatted { let filename = filename_input.to_string_lossy(); - show_error!( - "{}: no properly formatted checksum lines found", - if input_is_stdin { - "standard input" - } else { - &filename - } - .maybe_quote() - ); + if !status { + show_error!( + "{}: no properly formatted checksum lines found", + if input_is_stdin { + "standard input" + } else { + &filename + } + .maybe_quote() + ); + } set_exit_code(1); return Ok(()); } @@ -758,4 +775,115 @@ mod tests { assert!(detect_algo("sha3_512", None).is_err()); } + + #[test] + fn test_algo_based_regex() { + let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap(); + let test_cases = vec![ + ("SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some(("SHA256", None, "example.txt", "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))), + ("BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some(("BLAKE2b", Some("512"), "file", "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))), + (" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some(("MD5", None, "test", "9e107d9d372bb6826bd81d3542a419d6"))), + ("SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some(("SHA", Some("1"), "anotherfile", "a9993e364706816aba3e25717850c26c9cd0d89d"))), + ]; + + for (input, expected) in test_cases { + let captures = algo_based_regex.captures(input); + match expected { + Some((algo, bits, filename, checksum)) => { + assert!(captures.is_some()); + let captures = captures.unwrap(); + assert_eq!(captures.name("algo").unwrap().as_str(), algo); + assert_eq!(captures.name("bits").map(|m| m.as_str()), bits); + assert_eq!(captures.name("filename").unwrap().as_str(), filename); + assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); + } + None => { + assert!(captures.is_none()); + } + } + } + } + + #[test] + fn test_double_space_regex() { + let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap(); + + let test_cases = vec![ + ( + "60b725f10c9c85c70d97880dfe8191b3 a", + Some(("60b725f10c9c85c70d97880dfe8191b3", "a")), + ), + ( + "bf35d7536c785cf06730d5a40301eba2 b", + Some(("bf35d7536c785cf06730d5a40301eba2", " b")), + ), + ( + "f5b61709718c1ecf8db1aea8547d4698 *c", + Some(("f5b61709718c1ecf8db1aea8547d4698", "*c")), + ), + ( + "b064a020db8018f18ff5ae367d01b212 dd", + Some(("b064a020db8018f18ff5ae367d01b212", "dd")), + ), + ( + "b064a020db8018f18ff5ae367d01b212 ", + Some(("b064a020db8018f18ff5ae367d01b212", " ")), + ), + ("invalidchecksum test", None), + ]; + + for (input, expected) in test_cases { + let captures = double_space_regex.captures(input); + match expected { + Some((checksum, filename)) => { + assert!(captures.is_some()); + let captures = captures.unwrap(); + assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); + assert_eq!(captures.name("filename").unwrap().as_str(), filename); + } + None => { + assert!(captures.is_none()); + } + } + } + } + + #[test] + fn test_single_space_regex() { + let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap(); + let test_cases = vec![ + ( + "60b725f10c9c85c70d97880dfe8191b3 a", + Some(("60b725f10c9c85c70d97880dfe8191b3", "a")), + ), + ( + "bf35d7536c785cf06730d5a40301eba2 b", + Some(("bf35d7536c785cf06730d5a40301eba2", "b")), + ), + ( + "f5b61709718c1ecf8db1aea8547d4698 *c", + Some(("f5b61709718c1ecf8db1aea8547d4698", "c")), + ), + ( + "b064a020db8018f18ff5ae367d01b212 dd", + Some(("b064a020db8018f18ff5ae367d01b212", "dd")), + ), + ("invalidchecksum test", None), + ]; + + for (input, expected) in test_cases { + let captures = single_space_regex.captures(input); + match expected { + Some((checksum, filename)) => { + assert!(captures.is_some()); + let captures = captures.unwrap(); + assert_eq!(captures.name("checksum").unwrap().as_str(), checksum); + assert_eq!(captures.name("filename").unwrap().as_str(), filename); + } + None => { + assert!(captures.is_none()); + } + } + } + } } diff --git a/tests/by-util/test_hashsum.rs b/tests/by-util/test_hashsum.rs index 419ed43d8..c3c75b09a 100644 --- a/tests/by-util/test_hashsum.rs +++ b/tests/by-util/test_hashsum.rs @@ -365,26 +365,28 @@ fn test_check_md5sum() { } } +// GNU also supports one line sep #[test] -fn test_check_md5sum_not_enough_space() { +fn test_check_md5sum_only_one_space() { let scene = TestScenario::new(util_name!()); let at = &scene.fixtures; - for f in ["a", "b"] { + for f in ["a", " b", "c"] { at.write(f, &format!("{f}\n")); } at.write( "check.md5sum", "60b725f10c9c85c70d97880dfe8191b3 a\n\ - bf35d7536c785cf06730d5a40301eba2 b\n", + bf35d7536c785cf06730d5a40301eba2 b\n\ + 2cd6ee2c70b0bde53fbe6cac3c8b8bb1 *c\n", ); scene - .ccmd("md5sum") - .arg("--strict") - .arg("-c") - .arg("check.md5sum") - .fails() - .stderr_only("md5sum: check.md5sum: no properly formatted checksum lines found\nmd5sum: WARNING: 2 lines are improperly formatted\n"); + .ccmd("md5sum") + .arg("--strict") + .arg("-c") + .arg("check.md5sum") + .succeeds() + .stdout_only("a: OK\n b: OK\nc: OK\n"); } #[test] @@ -684,6 +686,87 @@ fn test_sha1_with_md5sum_should_fail() { .stderr_does_not_contain("WARNING: 1 line is improperly formatted"); } +#[test] +fn test_check_one_two_space_star() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + + at.touch("empty"); + + // with one space, the "*" is removed + at.write("in.md5", "d41d8cd98f00b204e9800998ecf8427e *empty\n"); + + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .succeeds() + .stdout_is("empty: OK\n"); + + // with two spaces, the "*" is not removed + at.write("in.md5", "d41d8cd98f00b204e9800998ecf8427e *empty\n"); + // First should fail as *empty doesn't exit + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .fails() + .stdout_is("*empty: FAILED open or read\n"); + + at.touch("*empty"); + // Should pass as we have the file + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .succeeds() + .stdout_is("*empty: OK\n"); +} + +#[test] +fn test_check_one_two_space_star_start_without_star() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + + at.touch("empty"); + at.touch("f"); + + // with one space, the "*" is removed + at.write( + "in.md5", + "d41d8cd98f00b204e9800998ecf8427e f\nd41d8cd98f00b204e9800998ecf8427e *empty\n", + ); + + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .succeeds() + .stdout_is("f: OK\nempty: OK\n"); + + // with two spaces, the "*" is not removed + at.write( + "in.md5", + "d41d8cd98f00b204e9800998ecf8427e f\nd41d8cd98f00b204e9800998ecf8427e *empty\n", + ); + // First should fail as *empty doesn't exit + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .fails() + .stdout_is("f: OK\n*empty: FAILED open or read\n"); + + at.touch("*empty"); + // Should pass as we have the file + scene + .ccmd("md5sum") + .arg("--check") + .arg(at.subdir.join("in.md5")) + .succeeds() + .stdout_is("f: OK\n*empty: OK\n"); +} + #[test] fn test_check_no_backslash_no_space() { let scene = TestScenario::new(util_name!());