mirror of
https://github.com/uutils/coreutils
synced 2024-10-07 00:19:14 +00:00
cut: allow non utf8 characters for delimiters (#6037)
This commit is contained in:
parent
da9682da7a
commit
156d3f7ee7
|
@ -6,9 +6,12 @@
|
|||
// spell-checker:ignore (ToDO) delim sourcefiles
|
||||
|
||||
use bstr::io::BufReadExt;
|
||||
use clap::{crate_version, Arg, ArgAction, Command};
|
||||
use clap::{builder::ValueParser, crate_version, Arg, ArgAction, ArgMatches, Command};
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write};
|
||||
#[cfg(unix)]
|
||||
use std::os::unix::ffi::OsStrExt;
|
||||
use std::path::Path;
|
||||
use uucore::display::Quotable;
|
||||
use uucore::error::{set_exit_code, FromIo, UResult, USimpleError};
|
||||
|
@ -26,27 +29,38 @@ const USAGE: &str = help_usage!("cut.md");
|
|||
const ABOUT: &str = help_about!("cut.md");
|
||||
const AFTER_HELP: &str = help_section!("after help", "cut.md");
|
||||
|
||||
struct Options {
|
||||
out_delim: Option<String>,
|
||||
struct Options<'a> {
|
||||
out_delimiter: Option<&'a [u8]>,
|
||||
line_ending: LineEnding,
|
||||
field_opts: Option<FieldOptions<'a>>,
|
||||
}
|
||||
|
||||
enum Delimiter {
|
||||
enum Delimiter<'a> {
|
||||
Whitespace,
|
||||
String(String), // FIXME: use char?
|
||||
Slice(&'a [u8]),
|
||||
}
|
||||
|
||||
struct FieldOptions {
|
||||
delimiter: Delimiter,
|
||||
out_delimiter: Option<String>,
|
||||
struct FieldOptions<'a> {
|
||||
delimiter: Delimiter<'a>,
|
||||
only_delimited: bool,
|
||||
line_ending: LineEnding,
|
||||
}
|
||||
|
||||
enum Mode {
|
||||
Bytes(Vec<Range>, Options),
|
||||
Characters(Vec<Range>, Options),
|
||||
Fields(Vec<Range>, FieldOptions),
|
||||
enum Mode<'a> {
|
||||
Bytes(Vec<Range>, Options<'a>),
|
||||
Characters(Vec<Range>, Options<'a>),
|
||||
Fields(Vec<Range>, Options<'a>),
|
||||
}
|
||||
|
||||
impl Default for Delimiter<'_> {
|
||||
fn default() -> Self {
|
||||
Self::Slice(b"\t")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a OsString> for Delimiter<'a> {
|
||||
fn from(s: &'a OsString) -> Self {
|
||||
Self::Slice(os_string_as_bytes(s).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
fn stdout_writer() -> Box<dyn Write> {
|
||||
|
@ -69,11 +83,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
|
|||
let newline_char = opts.line_ending.into();
|
||||
let mut buf_in = BufReader::new(reader);
|
||||
let mut out = stdout_writer();
|
||||
let delim = opts
|
||||
.out_delim
|
||||
.as_ref()
|
||||
.map_or("", String::as_str)
|
||||
.as_bytes();
|
||||
let out_delim = opts.out_delimiter.unwrap_or(b"\t");
|
||||
|
||||
let result = buf_in.for_byte_record(newline_char, |line| {
|
||||
let mut print_delim = false;
|
||||
|
@ -82,8 +92,8 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
|
|||
break;
|
||||
}
|
||||
if print_delim {
|
||||
out.write_all(delim)?;
|
||||
} else if opts.out_delim.is_some() {
|
||||
out.write_all(out_delim)?;
|
||||
} else if opts.out_delimiter.is_some() {
|
||||
print_delim = true;
|
||||
}
|
||||
// change `low` from 1-indexed value to 0-index value
|
||||
|
@ -109,7 +119,7 @@ fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
|
|||
ranges: &[Range],
|
||||
only_delimited: bool,
|
||||
newline_char: u8,
|
||||
out_delim: &str,
|
||||
out_delim: &[u8],
|
||||
) -> UResult<()> {
|
||||
let mut buf_in = BufReader::new(reader);
|
||||
let mut out = stdout_writer();
|
||||
|
@ -145,7 +155,7 @@ fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
|
|||
for _ in 0..=high - low {
|
||||
// skip printing delimiter if this is the first matching field for this line
|
||||
if print_delim {
|
||||
out.write_all(out_delim.as_bytes())?;
|
||||
out.write_all(out_delim)?;
|
||||
} else {
|
||||
print_delim = true;
|
||||
}
|
||||
|
@ -256,17 +266,18 @@ fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()> {
|
||||
let newline_char = opts.line_ending.into();
|
||||
match opts.delimiter {
|
||||
Delimiter::String(ref delim) => {
|
||||
let matcher = ExactMatcher::new(delim.as_bytes());
|
||||
let field_opts = opts.field_opts.as_ref().unwrap(); // it is safe to unwrap() here - field_opts will always be Some() for cut_fields() call
|
||||
match field_opts.delimiter {
|
||||
Delimiter::Slice(delim) => {
|
||||
let matcher = ExactMatcher::new(delim);
|
||||
match opts.out_delimiter {
|
||||
Some(ref out_delim) => cut_fields_explicit_out_delim(
|
||||
Some(out_delim) => cut_fields_explicit_out_delim(
|
||||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
field_opts.only_delimited,
|
||||
newline_char,
|
||||
out_delim,
|
||||
),
|
||||
|
@ -274,21 +285,20 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
|||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
field_opts.only_delimited,
|
||||
newline_char,
|
||||
),
|
||||
}
|
||||
}
|
||||
Delimiter::Whitespace => {
|
||||
let matcher = WhitespaceMatcher {};
|
||||
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
|
||||
cut_fields_explicit_out_delim(
|
||||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
field_opts.only_delimited,
|
||||
newline_char,
|
||||
out_delim,
|
||||
opts.out_delimiter.unwrap_or(b"\t"),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -337,6 +347,88 @@ fn cut_files(mut filenames: Vec<String>, mode: &Mode) {
|
|||
}
|
||||
}
|
||||
|
||||
// This is temporary helper function to convert OsString to &[u8] for unix targets only
|
||||
// TODO Remove this function and re-implement the functionality in each place that calls it
|
||||
// for all targets using https://doc.rust-lang.org/nightly/std/ffi/struct.OsStr.html#method.as_encoded_bytes
|
||||
// once project's MSRV is bumped up to 1.74.0+ so that function becomes available
|
||||
// For now - support unix targets only and on non-unix (i.e. Windows) will just return an error if delimiter value is not UTF-8
|
||||
fn os_string_as_bytes(os_string: &OsString) -> UResult<&[u8]> {
|
||||
#[cfg(unix)]
|
||||
let bytes = os_string.as_bytes();
|
||||
|
||||
#[cfg(not(unix))]
|
||||
let bytes = os_string
|
||||
.to_str()
|
||||
.ok_or_else(|| {
|
||||
uucore::error::UUsageError::new(
|
||||
1,
|
||||
"invalid UTF-8 was detected in one or more arguments",
|
||||
)
|
||||
})?
|
||||
.as_bytes();
|
||||
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
// Get delimiter and output delimiter from `-d`/`--delimiter` and `--output-delimiter` options respectively
|
||||
// Allow either delimiter to have a value that is neither UTF-8 nor ASCII to align with GNU behavior
|
||||
fn get_delimiters<'a>(
|
||||
matches: &'a ArgMatches,
|
||||
delimiter_is_equal: bool,
|
||||
os_string_equals: &'a OsString,
|
||||
os_string_nul: &'a OsString,
|
||||
) -> UResult<(Delimiter<'a>, Option<&'a [u8]>)> {
|
||||
let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
|
||||
let delim_opt = matches.get_one::<OsString>(options::DELIMITER);
|
||||
let delim = match delim_opt {
|
||||
Some(_) if whitespace_delimited => {
|
||||
return Err(USimpleError::new(
|
||||
1,
|
||||
"invalid input: Only one of --delimiter (-d) or -w option can be specified",
|
||||
));
|
||||
}
|
||||
Some(mut os_string) => {
|
||||
// GNU's `cut` supports `-d=` to set the delimiter to `=`.
|
||||
// Clap parsing is limited in this situation, see:
|
||||
// https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
|
||||
// rewrite the delimiter value os_string before further processing
|
||||
if delimiter_is_equal {
|
||||
os_string = os_string_equals;
|
||||
} else if os_string == "''" || os_string.is_empty() {
|
||||
// treat `''` as empty delimiter
|
||||
os_string = os_string_nul;
|
||||
}
|
||||
// For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
|
||||
// and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
|
||||
let bytes = os_string_as_bytes(os_string)?;
|
||||
if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
|
||||
|| os_string.to_str().is_none() && bytes.len() > 1
|
||||
{
|
||||
return Err(USimpleError::new(
|
||||
1,
|
||||
"the delimiter must be a single character",
|
||||
));
|
||||
} else {
|
||||
Delimiter::from(os_string)
|
||||
}
|
||||
}
|
||||
None => match whitespace_delimited {
|
||||
true => Delimiter::Whitespace,
|
||||
false => Delimiter::default(),
|
||||
},
|
||||
};
|
||||
let out_delim = matches
|
||||
.get_one::<OsString>(options::OUTPUT_DELIMITER)
|
||||
.map(|os_string| {
|
||||
if os_string.is_empty() || os_string == "''" {
|
||||
"\0".as_bytes()
|
||||
} else {
|
||||
os_string_as_bytes(os_string).unwrap()
|
||||
}
|
||||
});
|
||||
Ok((delim, out_delim))
|
||||
}
|
||||
|
||||
mod options {
|
||||
pub const BYTES: &str = "bytes";
|
||||
pub const CHARACTERS: &str = "characters";
|
||||
|
@ -352,12 +444,26 @@ mod options {
|
|||
|
||||
#[uucore::main]
|
||||
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||
let args = args.collect_ignore();
|
||||
let args = args.collect::<Vec<OsString>>();
|
||||
|
||||
let delimiter_is_equal = args.contains(&"-d=".to_string()); // special case
|
||||
let delimiter_is_equal = args.contains(&OsString::from("-d=")); // special case
|
||||
let matches = uu_app().try_get_matches_from(args)?;
|
||||
|
||||
let complement = matches.get_flag(options::COMPLEMENT);
|
||||
let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
|
||||
|
||||
// since OsString::from creates a new value and it does not by default have 'static lifetime like &str
|
||||
// we need to create these values here and pass them down to avoid issues with borrow checker and temporary values
|
||||
let os_string_equals = OsString::from("=");
|
||||
let os_string_nul = OsString::from("\0");
|
||||
|
||||
let (delimiter, out_delimiter) = get_delimiters(
|
||||
&matches,
|
||||
delimiter_is_equal,
|
||||
&os_string_equals,
|
||||
&os_string_nul,
|
||||
)?;
|
||||
let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
|
||||
|
||||
// Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
|
||||
// is expected. The number of those arguments is used for parsing a cutting
|
||||
|
@ -381,14 +487,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
|||
Mode::Bytes(
|
||||
ranges,
|
||||
Options {
|
||||
out_delim: Some(
|
||||
matches
|
||||
.get_one::<String>(options::OUTPUT_DELIMITER)
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or_default()
|
||||
.to_owned(),
|
||||
),
|
||||
line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)),
|
||||
out_delimiter,
|
||||
line_ending,
|
||||
field_opts: None,
|
||||
},
|
||||
)
|
||||
}),
|
||||
|
@ -396,84 +497,24 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
|||
Mode::Characters(
|
||||
ranges,
|
||||
Options {
|
||||
out_delim: Some(
|
||||
matches
|
||||
.get_one::<String>(options::OUTPUT_DELIMITER)
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or_default()
|
||||
.to_owned(),
|
||||
),
|
||||
line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)),
|
||||
out_delimiter,
|
||||
line_ending,
|
||||
field_opts: None,
|
||||
},
|
||||
)
|
||||
}),
|
||||
(1, None, None, Some(field_ranges)) => {
|
||||
list_to_ranges(field_ranges, complement).and_then(|ranges| {
|
||||
let out_delim = match matches.get_one::<String>(options::OUTPUT_DELIMITER) {
|
||||
Some(s) => {
|
||||
if s.is_empty() {
|
||||
Some("\0".to_owned())
|
||||
} else {
|
||||
Some(s.clone())
|
||||
}
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
|
||||
let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
|
||||
let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
|
||||
let line_ending = LineEnding::from_zero_flag(zero_terminated);
|
||||
|
||||
match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
|
||||
Some(_) if whitespace_delimited => {
|
||||
Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
|
||||
}
|
||||
Some(mut delim) => {
|
||||
// GNU's `cut` supports `-d=` to set the delimiter to `=`.
|
||||
// Clap parsing is limited in this situation, see:
|
||||
// https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
|
||||
if delimiter_is_equal {
|
||||
delim = "=";
|
||||
} else if delim == "''" {
|
||||
// treat `''` as empty delimiter
|
||||
delim = "";
|
||||
}
|
||||
if delim.chars().count() > 1 {
|
||||
Err("the delimiter must be a single character".into())
|
||||
} else {
|
||||
let delim = if delim.is_empty() {
|
||||
"\0".to_owned()
|
||||
} else {
|
||||
delim.to_owned()
|
||||
};
|
||||
|
||||
Ok(Mode::Fields(
|
||||
ranges,
|
||||
FieldOptions {
|
||||
delimiter: Delimiter::String(delim),
|
||||
out_delimiter: out_delim,
|
||||
only_delimited,
|
||||
line_ending,
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
None => Ok(Mode::Fields(
|
||||
ranges,
|
||||
FieldOptions {
|
||||
delimiter: match whitespace_delimited {
|
||||
true => Delimiter::Whitespace,
|
||||
false => Delimiter::String("\t".to_owned()),
|
||||
},
|
||||
out_delimiter: out_delim,
|
||||
only_delimited,
|
||||
line_ending,
|
||||
},
|
||||
)),
|
||||
}
|
||||
})
|
||||
}
|
||||
(1, None, None, Some(field_ranges)) => list_to_ranges(field_ranges, complement).map(|ranges| {
|
||||
Mode::Fields(
|
||||
ranges,
|
||||
Options {
|
||||
out_delimiter,
|
||||
line_ending,
|
||||
field_opts: Some(FieldOptions {
|
||||
only_delimited,
|
||||
delimiter,
|
||||
})},
|
||||
)
|
||||
}),
|
||||
(2.., _, _, _) => Err(
|
||||
"invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)".into()
|
||||
),
|
||||
|
@ -554,6 +595,7 @@ pub fn uu_app() -> Command {
|
|||
Arg::new(options::DELIMITER)
|
||||
.short('d')
|
||||
.long(options::DELIMITER)
|
||||
.value_parser(ValueParser::os_string())
|
||||
.help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
|
||||
.value_name("DELIM"),
|
||||
)
|
||||
|
@ -596,6 +638,7 @@ pub fn uu_app() -> Command {
|
|||
.arg(
|
||||
Arg::new(options::OUTPUT_DELIMITER)
|
||||
.long(options::OUTPUT_DELIMITER)
|
||||
.value_parser(ValueParser::os_string())
|
||||
.help("in field mode, replace the delimiter in output lines with this option's argument")
|
||||
.value_name("NEW_DELIM"),
|
||||
)
|
||||
|
|
|
@ -288,3 +288,17 @@ fn test_multiple_mode_args() {
|
|||
.stderr_is("cut: invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)\n");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(unix)]
|
||||
fn test_8bit_non_utf8_delimiter() {
|
||||
use std::ffi::OsStr;
|
||||
use std::os::unix::ffi::OsStrExt;
|
||||
let delim = OsStr::from_bytes(b"\xAD".as_slice());
|
||||
new_ucmd!()
|
||||
.arg("-d")
|
||||
.arg(delim)
|
||||
.args(&["--out=_", "-f2,3", "8bit-delim.txt"])
|
||||
.succeeds()
|
||||
.stdout_check(|out| out == "b_c\n".as_bytes());
|
||||
}
|
||||
|
|
1
tests/fixtures/cut/8bit-delim.txt
vendored
Normal file
1
tests/fixtures/cut/8bit-delim.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
abc
|
Loading…
Reference in a new issue