hashsum: improve the file verification algo.

We have 3 different kinds of input:
* "algo (filename) = checksum"
  example: `BLAKE2 (a) = bedfbb90d858c2d67b7ee8f7523be3d3b54004ef9e4f02f2ad79a1d05bfdfe49b81e3c92ebf99b504102b6bf003fa342587f5b3124c205f55204e8c4b4ce7d7c`
* "checksum  filename"
  example: `60b725f10c9c85c70d97880dfe8191b3  a`
* "checksum filename"
  example: `60b725f10c9c85c70d97880dfe8191b3 a`

These algo/regexp are tricky as files can be called "a, " b", " ", or "*c".
We look at the first time to analyze the kind of input and reuse the same regexp then.
This commit is contained in:
Sylvestre Ledru 2024-05-26 01:07:13 +02:00
parent 6acc8e695f
commit 2c83b28d18
2 changed files with 254 additions and 43 deletions

View file

@ -299,6 +299,17 @@ pub fn detect_algo(algo: &str, length: Option<usize>) -> UResult<HashAlgorithm>
}
}
// Regexp to handle the three input formats:
// 1. <algo>[-<bits>] (<filename>) = <checksum>
// algo must be uppercase or b (for blake2b)
// 2. <checksum> [* ]<filename>
// 3. <checksum> [*]<filename> (only one space)
const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>.*)\)\s*=\s*(?P<checksum>[a-fA-F0-9]+)$";
const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>.*)$";
// In this case, we ignore the *
const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<binary>\*?)(?P<filename>.*)$";
/***
* Do the checksum validation (can be strict or not)
*/
@ -317,12 +328,9 @@ pub fn perform_checksum_validation<'a, I>(
where
I: Iterator<Item = &'a OsStr>,
{
// Regexp to handle the two input formats:
// 1. <algo>[-<bits>] (<filename>) = <checksum>
// algo must be uppercase or b (for blake2b)
// 2. <checksum> [* ]<filename>
let regex_pattern = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename1>.*)\)\s*=\s*(?P<checksum1>[a-fA-F0-9]+)$|^(?P<checksum2>[a-fA-F0-9]+)\s[* ](?P<filename2>.*)";
let re = Regex::new(regex_pattern).unwrap();
let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap();
let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap();
let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap();
// if cksum has several input files, it will print the result for each file
for filename_input in files {
@ -350,30 +358,37 @@ where
}
}
};
let reader = BufReader::new(file);
let mut reader = BufReader::new(file);
let mut first_line = String::new();
reader.read_line(&mut first_line)?;
// Determine which regular expression to use based on the first line
let first_line_trim = first_line.trim();
let (chosen_regex, algo_based_format) = if algo_based_regex.is_match(first_line_trim) {
(&algo_based_regex, true)
} else if double_space_regex.is_match(first_line_trim) {
// If the first line contains a double space, use the double space regex
(&double_space_regex, false)
} else {
// It is probably rare but sometimes the checksum file may contain a single space
(&single_space_regex, false)
};
// Push the first line back to the reader
let first_line_reader = io::Cursor::new(first_line);
let chain_reader = first_line_reader.chain(reader);
let reader = BufReader::new(chain_reader);
// for each line in the input, check if it is a valid checksum line
for (i, line) in reader.lines().enumerate() {
let line = line.unwrap_or_else(|_| String::new());
if let Some(caps) = re.captures(&line) {
if let Some(caps) = chosen_regex.captures(&line) {
properly_formatted = true;
// Determine what kind of file input we had
// we need it for case "--check -a sm3 <file>" when <file> is
// <algo>[-<bits>] (<filename>) = <checksum>
let algo_based_format =
caps.name("filename1").is_some() && caps.name("checksum1").is_some();
let filename_to_check = caps.name("filename").unwrap().as_str();
let filename_to_check = caps
.name("filename1")
.or(caps.name("filename2"))
.unwrap()
.as_str();
let expected_checksum = caps
.name("checksum1")
.or(caps.name("checksum2"))
.unwrap()
.as_str();
let expected_checksum = caps.name("checksum").unwrap().as_str();
// If the algo_name is provided, we use it, otherwise we try to detect it
let (algo_name, length) = if algo_based_format {
@ -432,7 +447,7 @@ where
}
let mut algo = detect_algo(&algo_name, length)?;
let (filename_to_check_unescaped, prefix) = unescape_filename(filename_to_check);
let (filename_to_check_unescaped, prefix) = unescape_filename(&filename_to_check);
// manage the input file
let file_to_check: Box<dyn Read> = if filename_to_check == "-" {
@ -472,7 +487,7 @@ where
// Do the checksum validation
if expected_checksum == calculated_checksum {
if !quiet {
if !quiet && !status {
println!("{prefix}{filename_to_check}: OK");
}
correct_format += 1;
@ -510,15 +525,17 @@ where
// return an error
if !properly_formatted {
let filename = filename_input.to_string_lossy();
show_error!(
"{}: no properly formatted checksum lines found",
if input_is_stdin {
"standard input"
} else {
&filename
}
.maybe_quote()
);
if !status {
show_error!(
"{}: no properly formatted checksum lines found",
if input_is_stdin {
"standard input"
} else {
&filename
}
.maybe_quote()
);
}
set_exit_code(1);
return Ok(());
}
@ -758,4 +775,115 @@ mod tests {
assert!(detect_algo("sha3_512", None).is_err());
}
#[test]
fn test_algo_based_regex() {
let algo_based_regex = Regex::new(ALGO_BASED_REGEX).unwrap();
let test_cases = vec![
("SHA256 (example.txt) = d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", Some(("SHA256", None, "example.txt", "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"))),
("BLAKE2b-512 (file) = abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef", Some(("BLAKE2b", Some("512"), "file", "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef"))),
(" MD5 (test) = 9e107d9d372bb6826bd81d3542a419d6", Some(("MD5", None, "test", "9e107d9d372bb6826bd81d3542a419d6"))),
("SHA-1 (anotherfile) = a9993e364706816aba3e25717850c26c9cd0d89d", Some(("SHA", Some("1"), "anotherfile", "a9993e364706816aba3e25717850c26c9cd0d89d"))),
];
for (input, expected) in test_cases {
let captures = algo_based_regex.captures(input);
match expected {
Some((algo, bits, filename, checksum)) => {
assert!(captures.is_some());
let captures = captures.unwrap();
assert_eq!(captures.name("algo").unwrap().as_str(), algo);
assert_eq!(captures.name("bits").map(|m| m.as_str()), bits);
assert_eq!(captures.name("filename").unwrap().as_str(), filename);
assert_eq!(captures.name("checksum").unwrap().as_str(), checksum);
}
None => {
assert!(captures.is_none());
}
}
}
}
#[test]
fn test_double_space_regex() {
let double_space_regex = Regex::new(DOUBLE_SPACE_REGEX).unwrap();
let test_cases = vec![
(
"60b725f10c9c85c70d97880dfe8191b3 a",
Some(("60b725f10c9c85c70d97880dfe8191b3", "a")),
),
(
"bf35d7536c785cf06730d5a40301eba2 b",
Some(("bf35d7536c785cf06730d5a40301eba2", " b")),
),
(
"f5b61709718c1ecf8db1aea8547d4698 *c",
Some(("f5b61709718c1ecf8db1aea8547d4698", "*c")),
),
(
"b064a020db8018f18ff5ae367d01b212 dd",
Some(("b064a020db8018f18ff5ae367d01b212", "dd")),
),
(
"b064a020db8018f18ff5ae367d01b212 ",
Some(("b064a020db8018f18ff5ae367d01b212", " ")),
),
("invalidchecksum test", None),
];
for (input, expected) in test_cases {
let captures = double_space_regex.captures(input);
match expected {
Some((checksum, filename)) => {
assert!(captures.is_some());
let captures = captures.unwrap();
assert_eq!(captures.name("checksum").unwrap().as_str(), checksum);
assert_eq!(captures.name("filename").unwrap().as_str(), filename);
}
None => {
assert!(captures.is_none());
}
}
}
}
#[test]
fn test_single_space_regex() {
let single_space_regex = Regex::new(SINGLE_SPACE_REGEX).unwrap();
let test_cases = vec![
(
"60b725f10c9c85c70d97880dfe8191b3 a",
Some(("60b725f10c9c85c70d97880dfe8191b3", "a")),
),
(
"bf35d7536c785cf06730d5a40301eba2 b",
Some(("bf35d7536c785cf06730d5a40301eba2", "b")),
),
(
"f5b61709718c1ecf8db1aea8547d4698 *c",
Some(("f5b61709718c1ecf8db1aea8547d4698", "c")),
),
(
"b064a020db8018f18ff5ae367d01b212 dd",
Some(("b064a020db8018f18ff5ae367d01b212", "dd")),
),
("invalidchecksum test", None),
];
for (input, expected) in test_cases {
let captures = single_space_regex.captures(input);
match expected {
Some((checksum, filename)) => {
assert!(captures.is_some());
let captures = captures.unwrap();
assert_eq!(captures.name("checksum").unwrap().as_str(), checksum);
assert_eq!(captures.name("filename").unwrap().as_str(), filename);
}
None => {
assert!(captures.is_none());
}
}
}
}
}

View file

@ -365,26 +365,28 @@ fn test_check_md5sum() {
}
}
// GNU also supports one line sep
#[test]
fn test_check_md5sum_not_enough_space() {
fn test_check_md5sum_only_one_space() {
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
for f in ["a", "b"] {
for f in ["a", " b", "c"] {
at.write(f, &format!("{f}\n"));
}
at.write(
"check.md5sum",
"60b725f10c9c85c70d97880dfe8191b3 a\n\
bf35d7536c785cf06730d5a40301eba2 b\n",
bf35d7536c785cf06730d5a40301eba2 b\n\
2cd6ee2c70b0bde53fbe6cac3c8b8bb1 *c\n",
);
scene
.ccmd("md5sum")
.arg("--strict")
.arg("-c")
.arg("check.md5sum")
.fails()
.stderr_only("md5sum: check.md5sum: no properly formatted checksum lines found\nmd5sum: WARNING: 2 lines are improperly formatted\n");
.ccmd("md5sum")
.arg("--strict")
.arg("-c")
.arg("check.md5sum")
.succeeds()
.stdout_only("a: OK\n b: OK\nc: OK\n");
}
#[test]
@ -684,6 +686,87 @@ fn test_sha1_with_md5sum_should_fail() {
.stderr_does_not_contain("WARNING: 1 line is improperly formatted");
}
#[test]
fn test_check_one_two_space_star() {
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
at.touch("empty");
// with one space, the "*" is removed
at.write("in.md5", "d41d8cd98f00b204e9800998ecf8427e *empty\n");
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.succeeds()
.stdout_is("empty: OK\n");
// with two spaces, the "*" is not removed
at.write("in.md5", "d41d8cd98f00b204e9800998ecf8427e *empty\n");
// First should fail as *empty doesn't exit
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.fails()
.stdout_is("*empty: FAILED open or read\n");
at.touch("*empty");
// Should pass as we have the file
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.succeeds()
.stdout_is("*empty: OK\n");
}
#[test]
fn test_check_one_two_space_star_start_without_star() {
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
at.touch("empty");
at.touch("f");
// with one space, the "*" is removed
at.write(
"in.md5",
"d41d8cd98f00b204e9800998ecf8427e f\nd41d8cd98f00b204e9800998ecf8427e *empty\n",
);
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.succeeds()
.stdout_is("f: OK\nempty: OK\n");
// with two spaces, the "*" is not removed
at.write(
"in.md5",
"d41d8cd98f00b204e9800998ecf8427e f\nd41d8cd98f00b204e9800998ecf8427e *empty\n",
);
// First should fail as *empty doesn't exit
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.fails()
.stdout_is("f: OK\n*empty: FAILED open or read\n");
at.touch("*empty");
// Should pass as we have the file
scene
.ccmd("md5sum")
.arg("--check")
.arg(at.subdir.join("in.md5"))
.succeeds()
.stdout_is("f: OK\n*empty: OK\n");
}
#[test]
fn test_check_no_backslash_no_space() {
let scene = TestScenario::new(util_name!());