From 156d3f7ee76addb0dd77bfb6872692d3efa43c4d Mon Sep 17 00:00:00 2001 From: Yury Zhytkou <54360928+zhitkoff@users.noreply.github.com> Date: Sun, 10 Mar 2024 17:36:17 -0400 Subject: [PATCH] cut: allow non utf8 characters for delimiters (#6037) --- src/uu/cut/src/cut.rs | 277 +++++++++++++++++------------- tests/by-util/test_cut.rs | 14 ++ tests/fixtures/cut/8bit-delim.txt | 1 + 3 files changed, 175 insertions(+), 117 deletions(-) create mode 100644 tests/fixtures/cut/8bit-delim.txt diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index e89568716..14aa2df96 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -6,9 +6,12 @@ // spell-checker:ignore (ToDO) delim sourcefiles use bstr::io::BufReadExt; -use clap::{crate_version, Arg, ArgAction, Command}; +use clap::{builder::ValueParser, crate_version, Arg, ArgAction, ArgMatches, Command}; +use std::ffi::OsString; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write}; +#[cfg(unix)] +use std::os::unix::ffi::OsStrExt; use std::path::Path; use uucore::display::Quotable; use uucore::error::{set_exit_code, FromIo, UResult, USimpleError}; @@ -26,27 +29,38 @@ const USAGE: &str = help_usage!("cut.md"); const ABOUT: &str = help_about!("cut.md"); const AFTER_HELP: &str = help_section!("after help", "cut.md"); -struct Options { - out_delim: Option, +struct Options<'a> { + out_delimiter: Option<&'a [u8]>, line_ending: LineEnding, + field_opts: Option>, } -enum Delimiter { +enum Delimiter<'a> { Whitespace, - String(String), // FIXME: use char? + Slice(&'a [u8]), } -struct FieldOptions { - delimiter: Delimiter, - out_delimiter: Option, +struct FieldOptions<'a> { + delimiter: Delimiter<'a>, only_delimited: bool, - line_ending: LineEnding, } -enum Mode { - Bytes(Vec, Options), - Characters(Vec, Options), - Fields(Vec, FieldOptions), +enum Mode<'a> { + Bytes(Vec, Options<'a>), + Characters(Vec, Options<'a>), + Fields(Vec, Options<'a>), +} + +impl Default for Delimiter<'_> { + fn default() -> Self { + Self::Slice(b"\t") + } +} + +impl<'a> From<&'a OsString> for Delimiter<'a> { + fn from(s: &'a OsString) -> Self { + Self::Slice(os_string_as_bytes(s).unwrap()) + } } fn stdout_writer() -> Box { @@ -69,11 +83,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() let newline_char = opts.line_ending.into(); let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); - let delim = opts - .out_delim - .as_ref() - .map_or("", String::as_str) - .as_bytes(); + let out_delim = opts.out_delimiter.unwrap_or(b"\t"); let result = buf_in.for_byte_record(newline_char, |line| { let mut print_delim = false; @@ -82,8 +92,8 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() break; } if print_delim { - out.write_all(delim)?; - } else if opts.out_delim.is_some() { + out.write_all(out_delim)?; + } else if opts.out_delimiter.is_some() { print_delim = true; } // change `low` from 1-indexed value to 0-index value @@ -109,7 +119,7 @@ fn cut_fields_explicit_out_delim( ranges: &[Range], only_delimited: bool, newline_char: u8, - out_delim: &str, + out_delim: &[u8], ) -> UResult<()> { let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); @@ -145,7 +155,7 @@ fn cut_fields_explicit_out_delim( for _ in 0..=high - low { // skip printing delimiter if this is the first matching field for this line if print_delim { - out.write_all(out_delim.as_bytes())?; + out.write_all(out_delim)?; } else { print_delim = true; } @@ -256,17 +266,18 @@ fn cut_fields_implicit_out_delim( Ok(()) } -fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { +fn cut_fields(reader: R, ranges: &[Range], opts: &Options) -> UResult<()> { let newline_char = opts.line_ending.into(); - match opts.delimiter { - Delimiter::String(ref delim) => { - let matcher = ExactMatcher::new(delim.as_bytes()); + let field_opts = opts.field_opts.as_ref().unwrap(); // it is safe to unwrap() here - field_opts will always be Some() for cut_fields() call + match field_opts.delimiter { + Delimiter::Slice(delim) => { + let matcher = ExactMatcher::new(delim); match opts.out_delimiter { - Some(ref out_delim) => cut_fields_explicit_out_delim( + Some(out_delim) => cut_fields_explicit_out_delim( reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, out_delim, ), @@ -274,21 +285,20 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, ), } } Delimiter::Whitespace => { let matcher = WhitespaceMatcher {}; - let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t"); cut_fields_explicit_out_delim( reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, - out_delim, + opts.out_delimiter.unwrap_or(b"\t"), ) } } @@ -337,6 +347,88 @@ fn cut_files(mut filenames: Vec, mode: &Mode) { } } +// This is temporary helper function to convert OsString to &[u8] for unix targets only +// TODO Remove this function and re-implement the functionality in each place that calls it +// for all targets using https://doc.rust-lang.org/nightly/std/ffi/struct.OsStr.html#method.as_encoded_bytes +// once project's MSRV is bumped up to 1.74.0+ so that function becomes available +// For now - support unix targets only and on non-unix (i.e. Windows) will just return an error if delimiter value is not UTF-8 +fn os_string_as_bytes(os_string: &OsString) -> UResult<&[u8]> { + #[cfg(unix)] + let bytes = os_string.as_bytes(); + + #[cfg(not(unix))] + let bytes = os_string + .to_str() + .ok_or_else(|| { + uucore::error::UUsageError::new( + 1, + "invalid UTF-8 was detected in one or more arguments", + ) + })? + .as_bytes(); + + Ok(bytes) +} + +// Get delimiter and output delimiter from `-d`/`--delimiter` and `--output-delimiter` options respectively +// Allow either delimiter to have a value that is neither UTF-8 nor ASCII to align with GNU behavior +fn get_delimiters<'a>( + matches: &'a ArgMatches, + delimiter_is_equal: bool, + os_string_equals: &'a OsString, + os_string_nul: &'a OsString, +) -> UResult<(Delimiter<'a>, Option<&'a [u8]>)> { + let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED); + let delim_opt = matches.get_one::(options::DELIMITER); + let delim = match delim_opt { + Some(_) if whitespace_delimited => { + return Err(USimpleError::new( + 1, + "invalid input: Only one of --delimiter (-d) or -w option can be specified", + )); + } + Some(mut os_string) => { + // GNU's `cut` supports `-d=` to set the delimiter to `=`. + // Clap parsing is limited in this situation, see: + // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 + // rewrite the delimiter value os_string before further processing + if delimiter_is_equal { + os_string = os_string_equals; + } else if os_string == "''" || os_string.is_empty() { + // treat `''` as empty delimiter + os_string = os_string_nul; + } + // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters + // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior + let bytes = os_string_as_bytes(os_string)?; + if os_string.to_str().is_some_and(|s| s.chars().count() > 1) + || os_string.to_str().is_none() && bytes.len() > 1 + { + return Err(USimpleError::new( + 1, + "the delimiter must be a single character", + )); + } else { + Delimiter::from(os_string) + } + } + None => match whitespace_delimited { + true => Delimiter::Whitespace, + false => Delimiter::default(), + }, + }; + let out_delim = matches + .get_one::(options::OUTPUT_DELIMITER) + .map(|os_string| { + if os_string.is_empty() || os_string == "''" { + "\0".as_bytes() + } else { + os_string_as_bytes(os_string).unwrap() + } + }); + Ok((delim, out_delim)) +} + mod options { pub const BYTES: &str = "bytes"; pub const CHARACTERS: &str = "characters"; @@ -352,12 +444,26 @@ mod options { #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let args = args.collect_ignore(); + let args = args.collect::>(); - let delimiter_is_equal = args.contains(&"-d=".to_string()); // special case + let delimiter_is_equal = args.contains(&OsString::from("-d=")); // special case let matches = uu_app().try_get_matches_from(args)?; let complement = matches.get_flag(options::COMPLEMENT); + let only_delimited = matches.get_flag(options::ONLY_DELIMITED); + + // since OsString::from creates a new value and it does not by default have 'static lifetime like &str + // we need to create these values here and pass them down to avoid issues with borrow checker and temporary values + let os_string_equals = OsString::from("="); + let os_string_nul = OsString::from("\0"); + + let (delimiter, out_delimiter) = get_delimiters( + &matches, + delimiter_is_equal, + &os_string_equals, + &os_string_nul, + )?; + let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)); // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`, // is expected. The number of those arguments is used for parsing a cutting @@ -381,14 +487,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Bytes( ranges, Options { - out_delim: Some( - matches - .get_one::(options::OUTPUT_DELIMITER) - .map(|s| s.as_str()) - .unwrap_or_default() - .to_owned(), - ), - line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + out_delimiter, + line_ending, + field_opts: None, }, ) }), @@ -396,84 +497,24 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Characters( ranges, Options { - out_delim: Some( - matches - .get_one::(options::OUTPUT_DELIMITER) - .map(|s| s.as_str()) - .unwrap_or_default() - .to_owned(), - ), - line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + out_delimiter, + line_ending, + field_opts: None, }, ) }), - (1, None, None, Some(field_ranges)) => { - list_to_ranges(field_ranges, complement).and_then(|ranges| { - let out_delim = match matches.get_one::(options::OUTPUT_DELIMITER) { - Some(s) => { - if s.is_empty() { - Some("\0".to_owned()) - } else { - Some(s.clone()) - } - } - None => None, - }; - - let only_delimited = matches.get_flag(options::ONLY_DELIMITED); - let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED); - let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); - let line_ending = LineEnding::from_zero_flag(zero_terminated); - - match matches.get_one::(options::DELIMITER).map(|s| s.as_str()) { - Some(_) if whitespace_delimited => { - Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into()) - } - Some(mut delim) => { - // GNU's `cut` supports `-d=` to set the delimiter to `=`. - // Clap parsing is limited in this situation, see: - // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 - if delimiter_is_equal { - delim = "="; - } else if delim == "''" { - // treat `''` as empty delimiter - delim = ""; - } - if delim.chars().count() > 1 { - Err("the delimiter must be a single character".into()) - } else { - let delim = if delim.is_empty() { - "\0".to_owned() - } else { - delim.to_owned() - }; - - Ok(Mode::Fields( - ranges, - FieldOptions { - delimiter: Delimiter::String(delim), - out_delimiter: out_delim, - only_delimited, - line_ending, - }, - )) - } - } - None => Ok(Mode::Fields( - ranges, - FieldOptions { - delimiter: match whitespace_delimited { - true => Delimiter::Whitespace, - false => Delimiter::String("\t".to_owned()), - }, - out_delimiter: out_delim, - only_delimited, - line_ending, - }, - )), - } - }) - } + (1, None, None, Some(field_ranges)) => list_to_ranges(field_ranges, complement).map(|ranges| { + Mode::Fields( + ranges, + Options { + out_delimiter, + line_ending, + field_opts: Some(FieldOptions { + only_delimited, + delimiter, + })}, + ) + }), (2.., _, _, _) => Err( "invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)".into() ), @@ -554,6 +595,7 @@ pub fn uu_app() -> Command { Arg::new(options::DELIMITER) .short('d') .long(options::DELIMITER) + .value_parser(ValueParser::os_string()) .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.") .value_name("DELIM"), ) @@ -596,6 +638,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(options::OUTPUT_DELIMITER) .long(options::OUTPUT_DELIMITER) + .value_parser(ValueParser::os_string()) .help("in field mode, replace the delimiter in output lines with this option's argument") .value_name("NEW_DELIM"), ) diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index 2473ead19..50d158f96 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -288,3 +288,17 @@ fn test_multiple_mode_args() { .stderr_is("cut: invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)\n"); } } + +#[test] +#[cfg(unix)] +fn test_8bit_non_utf8_delimiter() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + let delim = OsStr::from_bytes(b"\xAD".as_slice()); + new_ucmd!() + .arg("-d") + .arg(delim) + .args(&["--out=_", "-f2,3", "8bit-delim.txt"]) + .succeeds() + .stdout_check(|out| out == "b_c\n".as_bytes()); +} diff --git a/tests/fixtures/cut/8bit-delim.txt b/tests/fixtures/cut/8bit-delim.txt new file mode 100644 index 000000000..2312c916a --- /dev/null +++ b/tests/fixtures/cut/8bit-delim.txt @@ -0,0 +1 @@ +a­b­c