cut: refactor (#4255)

refactors the `cut field` logic to reduce code duplication by factoring out the common `Searcher`, which is _templatized_ on a specific `Matcher` -- `ExactMatcher` for an explicit delimiter and `WhitespaceMatcher` for a white-space delimiter.

before
- code duplication in `Searcher` and `WhitespaceSearcher`
- code duplication in `cut_fields` and `cut_fields_whitespace`

after
- two versions of `Matcher`s
- one `Searcher`
- simplify `cut_fields` by delegating actual work to specific functions
This commit is contained in:
TechHara 2023-01-27 15:07:36 -05:00 committed by GitHub
parent 8c6d0e7630
commit 3ad36a49cb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 354 additions and 308 deletions

View file

@ -16,12 +16,12 @@ use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError};
use self::searcher::Searcher;
use self::whitespace_searcher::WhitespaceSearcher;
use matcher::{ExactMatcher, Matcher, WhitespaceMatcher};
use uucore::ranges::Range;
use uucore::{format_usage, show, show_error, show_if_err};
mod matcher;
mod searcher;
mod whitespace_searcher;
static USAGE: &str =
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
@ -188,23 +188,22 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
Ok(())
}
#[allow(clippy::cognitive_complexity)]
fn cut_fields_delimiter<R: Read>(
// Output delimiter is explicitly specified
fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
delim: &str,
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let input_delim_len = delim.len();
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
let mut delim_search = Searcher::new(matcher, line).peekable();
let mut print_delim = false;
if delim_search.peek().is_none() {
@ -218,85 +217,6 @@ fn cut_fields_delimiter<R: Read>(
return Ok(true);
}
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some(index) => index + input_delim_len,
None => break,
};
}
for _ in 0..=high - low {
if print_delim {
out.write_all(out_delim.as_bytes())?;
} else {
print_delim = true;
}
match delim_search.next() {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
out.write_all(segment)?;
low_idx = high_idx + input_delim_len;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..];
out.write_all(segment)?;
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}
}
out.write_all(&[newline_char])?;
Ok(true)
});
if let Err(e) = result {
return Err(USimpleError::new(1, e.to_string()));
}
Ok(())
}
fn cut_fields_whitespace<R: Read>(
reader: R,
ranges: &[Range],
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = WhitespaceSearcher::new(line).peekable();
let mut print_delim = false;
if delim_search.peek().is_none() {
if !only_delimited {
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
}
}
return Ok(true);
}
// The logic is identical to `cut_fields_delimiter` function above, which uses
// `Searcher` that iterates over and returns the first position of the delimiter character.
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
// delimiter character positions, since each delimiter sequence length can vary.
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// current field is not in the range, so jump to the field corresponding to the
@ -317,7 +237,7 @@ fn cut_fields_whitespace<R: Read>(
}
match delim_search.next() {
// print the current field up to the next whitespace
// print the current field up to the next field delim
Some((first, last)) => {
let segment = &line[low_idx..first];
@ -352,93 +272,110 @@ fn cut_fields_whitespace<R: Read>(
Ok(())
}
// Output delimiter is the same as input delimiter
//
// Prints the selected fields of every record read from `reader`. Because no separate
// output delimiter exists, the delimiter bytes found in the input are copied to the
// output as part of the printed segments.
//
// * `matcher` - locates the field delimiters of a record (driven through `Searcher`)
// * `ranges` - field ranges to print; fields are counted from 1 and `high` is inclusive.
//   NOTE(review): the unsigned `low - fields_pos` below assumes ascending,
//   non-overlapping ranges - confirm against the range parser.
// * `only_delimited` - when true, records that contain no delimiter are not printed
// * `newline_char` - record terminator byte
//
// A failure reported by the record loop is returned as a `USimpleError` built with code 1.
fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
only_delimited: bool,
newline_char: u8,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
// NOTE(review): the closure's `Ok(true)` presumably tells the record reader to continue
// with the next record - confirm against the bstr documentation.
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
// number (counted from 1) of the first field whose closing delimiter has not yet been
// taken from `delim_search`
let mut fields_pos = 1;
// byte offset in `line` at which the next printed segment starts
let mut low_idx = 0;
let mut delim_search = Searcher::new(matcher, line).peekable();
// set once a segment has been written; later segments then start at the delimiter
// itself, so the input delimiter is what separates the printed fields
let mut print_delim = false;
// no delimiter in this record: print it as-is unless only delimited records are wanted
if delim_search.peek().is_none() {
if !only_delimited {
out.write_all(line)?;
// add the terminator when the record does not already end with it
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
}
}
return Ok(true);
}
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// skip the delimiters of the fields lying between the previous range and `low`
if let Some((first, last)) = delim_search.nth(low - fields_pos - 1) {
// `first..last` is the delimiter in front of field `low`: keep it when something
// was printed before, otherwise start right after it
low_idx = if print_delim { first } else { last }
} else {
// the record has fewer than `low` fields
break;
}
}
// advance to the delimiter that closes field `high`
match delim_search.nth(high - low) {
Some((first, _)) => {
// print up to, but not including, that delimiter
let segment = &line[low_idx..first];
out.write_all(segment)?;
print_delim = true;
low_idx = first;
fields_pos = high + 1;
}
None => {
// the record ends inside this range: print everything that is left
let segment = &line[low_idx..line.len()];
out.write_all(segment)?;
// the segment just written already ended with the terminator
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}
// terminate the output record
out.write_all(&[newline_char])?;
Ok(true)
});
if let Err(e) = result {
return Err(USimpleError::new(1, e.to_string()));
}
Ok(())
}
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
match opts.delimiter {
Delimiter::Whitespace => cut_fields_whitespace(
reader,
ranges,
opts.only_delimited,
newline_char,
opts.out_delimiter.as_deref().unwrap_or("\t"),
),
Delimiter::String(ref delimiter) => {
if let Some(ref o_delim) = opts.out_delimiter {
return cut_fields_delimiter(
Delimiter::String(ref delim) => {
let matcher = ExactMatcher::new(delim.as_bytes());
match opts.out_delimiter {
Some(ref out_delim) => cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
delimiter,
opts.only_delimited,
newline_char,
o_delim,
);
out_delim,
),
None => cut_fields_implicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
),
}
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim_len = delimiter.len();
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
let mut print_delim = false;
if delim_search.peek().is_none() {
if !opts.only_delimited {
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
}
}
return Ok(true);
}
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim {
delim_pos
} else {
delim_pos + delim_len
}
} else {
break;
}
}
match delim_search.nth(high - low) {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
out.write_all(segment)?;
print_delim = true;
low_idx = high_idx;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];
out.write_all(segment)?;
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}
out.write_all(&[newline_char])?;
Ok(true)
});
if let Err(e) = result {
return Err(USimpleError::new(1, e.to_string()));
}
Ok(())
}
Delimiter::Whitespace => {
let matcher = WhitespaceMatcher {};
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
out_delim,
)
}
}
}

126
src/uu/cut/src/matcher.rs Normal file
View file

@ -0,0 +1,126 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use memchr::{memchr, memchr2};
// Strategy for locating a field delimiter inside a byte slice.
//
// Find the next matching byte sequence positions
// Return (first, last) where haystack[first..last] corresponds to the matched pattern
pub trait Matcher {
// Returns the half-open byte range `(first, last)` of the first match in `haystack`,
// with both offsets relative to the start of `haystack`, or `None` when nothing matches.
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)>;
}
// Delimiter matcher for one fixed, non-empty byte sequence.
pub struct ExactMatcher<'a> {
    needle: &'a [u8],
}

impl<'a> ExactMatcher<'a> {
    // Wraps `needle` as the pattern to look for; panics when `needle` is empty.
    pub fn new(needle: &'a [u8]) -> Self {
        assert!(!needle.is_empty());
        Self { needle }
    }
}

impl<'a> Matcher for ExactMatcher<'a> {
    // Looks for the first byte of the needle and checks the remaining needle bytes at
    // every candidate position, resuming one byte past each rejected candidate.
    fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        let head = self.needle[0];
        // empty for a one-byte needle, in which case every candidate is accepted
        let tail = &self.needle[1..];
        let mut start = 0usize;
        while let Some(offset) = memchr(head, &haystack[start..]) {
            // `offset` is relative to `start`; convert it to an index into `haystack`
            let candidate = start + offset;
            if haystack[candidate + 1..].starts_with(tail) {
                return Some((candidate, candidate + self.needle.len()));
            }
            start = candidate + 1;
        }
        None
    }
}
// Delimiter matcher for an unbroken run of SPACE and/or TAB bytes.
pub struct WhitespaceMatcher {}

impl Matcher for WhitespaceMatcher {
    // Finds the first SPACE or TAB and extends the match over every SPACE/TAB byte that
    // directly follows it.
    fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        let first = memchr2(b' ', b'\t', haystack)?;
        // `haystack[first]` is whitespace itself, so the run is at least one byte long
        let run = haystack[first..]
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        Some((first, first + run))
    }
}
// Unit tests for `ExactMatcher` and `WhitespaceMatcher`.
#[cfg(test)]
mod matcher_tests {
    use super::*;

    // A one-byte needle is found at the start, in the middle and at the end.
    #[test]
    fn test_exact_matcher_single_byte() {
        let matcher = ExactMatcher::new(b":");
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b":"), Some((0, 1)));
        assert_eq!(matcher.next_match(b":abcxyz"), Some((0, 1)));
        assert_eq!(matcher.next_match(b"abc:xyz"), Some((3, 4)));
        assert_eq!(matcher.next_match(b"abcxyz:"), Some((6, 7)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    // A two-byte needle yields a range that is two bytes wide.
    #[test]
    fn test_exact_matcher_multi_bytes() {
        let matcher = ExactMatcher::new(b"<>");
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b"<>"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"<>abcxyz"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"abc<>xyz"), Some((3, 5)));
        assert_eq!(matcher.next_match(b"abcxyz<>"), Some((6, 8)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    // A lone SPACE or TAB is a one-byte delimiter.
    #[test]
    fn test_whitespace_matcher_single_space() {
        let matcher = WhitespaceMatcher {};
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b" "), Some((0, 1)));
        assert_eq!(matcher.next_match(b"\tabcxyz"), Some((0, 1)));
        assert_eq!(matcher.next_match(b"abc\txyz"), Some((3, 4)));
        assert_eq!(matcher.next_match(b"abcxyz "), Some((6, 7)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    // Consecutive SPACE/TAB bytes are reported as one delimiter.
    #[test]
    fn test_whitespace_matcher_multi_spaces() {
        let matcher = WhitespaceMatcher {};
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b" \t "), Some((0, 3)));
        assert_eq!(matcher.next_match(b"\t\tabcxyz"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"abc \txyz"), Some((3, 5)));
        // two trailing spaces: the expected range (6, 8) is two bytes wide
        assert_eq!(matcher.next_match(b"abcxyz  "), Some((6, 8)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }
}

View file

@ -5,82 +5,77 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use memchr::memchr;
// spell-checker:ignore multispace
pub struct Searcher<'a> {
haystack: &'a [u8],
needle: &'a [u8],
use super::matcher::Matcher;
// Generic searcher that relies on a specific matcher
pub struct Searcher<'a, 'b, M: Matcher> {
matcher: &'a M,
haystack: &'b [u8],
position: usize,
}
impl<'a> Searcher<'a> {
pub fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> {
assert!(!needle.is_empty());
Searcher {
impl<'a, 'b, M: Matcher> Searcher<'a, 'b, M> {
pub fn new(matcher: &'a M, haystack: &'b [u8]) -> Self {
Self {
matcher,
haystack,
needle,
position: 0,
}
}
}
impl<'a> Iterator for Searcher<'a> {
type Item = usize;
// Iterate over field delimiters
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
// corresponds to the delimiter.
impl<'a, 'b, M: Matcher> Iterator for Searcher<'a, 'b, M> {
type Item = (usize, usize);
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(match_idx) = memchr(self.needle[0], self.haystack) {
if self.needle.len() == 1
|| self.haystack[match_idx + 1..].starts_with(&self.needle[1..])
{
let match_pos = self.position + match_idx;
let skip = match_idx + self.needle.len();
self.haystack = &self.haystack[skip..];
self.position += skip;
return Some(match_pos);
} else {
let skip = match_idx + 1;
self.haystack = &self.haystack[skip..];
self.position += skip;
// continue
}
} else {
return None;
match self.matcher.next_match(&self.haystack[self.position..]) {
Some((first, last)) => {
let result = (first + self.position, last + self.position);
self.position += last;
Some(result)
}
None => None,
}
}
}
#[cfg(test)]
mod tests {
mod exact_searcher_tests {
use super::super::matcher::ExactMatcher;
use super::*;
const NEEDLE: &[u8] = "ab".as_bytes();
#[test]
fn test_normal() {
let iter = Searcher::new("a.a.a".as_bytes(), "a".as_bytes());
let items: Vec<usize> = iter.collect();
assert_eq!(vec![0, 2, 4], items);
let matcher = ExactMatcher::new("a".as_bytes());
let iter = Searcher::new(&matcher, "a.a.a".as_bytes());
let items: Vec<(usize, usize)> = iter.collect();
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
}
#[test]
fn test_empty() {
let iter = Searcher::new("".as_bytes(), "a".as_bytes());
let items: Vec<usize> = iter.collect();
assert_eq!(vec![] as Vec<usize>, items);
let matcher = ExactMatcher::new("a".as_bytes());
let iter = Searcher::new(&matcher, "".as_bytes());
let items: Vec<(usize, usize)> = iter.collect();
assert_eq!(vec![] as Vec<(usize, usize)>, items);
}
fn test_multibyte(line: &[u8], expected: &[usize]) {
let iter = Searcher::new(line, NEEDLE);
let items: Vec<usize> = iter.collect();
fn test_multibyte(line: &[u8], expected: &[(usize, usize)]) {
let matcher = ExactMatcher::new("ab".as_bytes());
let iter = Searcher::new(&matcher, line);
let items: Vec<(usize, usize)> = iter.collect();
assert_eq!(expected, items);
}
#[test]
fn test_multibyte_normal() {
test_multibyte("...ab...ab...".as_bytes(), &[3, 8]);
test_multibyte("...ab...ab...".as_bytes(), &[(3, 5), (8, 10)]);
}
#[test]
@ -90,16 +85,101 @@ mod tests {
#[test]
fn test_multibyte_starting_needle() {
test_multibyte("ab...ab...".as_bytes(), &[0, 5]);
test_multibyte("ab...ab...".as_bytes(), &[(0, 2), (5, 7)]);
}
#[test]
fn test_multibyte_trailing_needle() {
test_multibyte("...ab...ab".as_bytes(), &[3, 8]);
test_multibyte("...ab...ab".as_bytes(), &[(3, 5), (8, 10)]);
}
#[test]
fn test_multibyte_first_byte_false_match() {
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[10]);
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[(10, 12)]);
}
#[test]
fn test_searcher_with_exact_matcher() {
    // Leading, adjacent, separated and trailing `<>` delimiters in a single haystack.
    let matcher = ExactMatcher::new(b"<>");
    let mut searcher = Searcher::new(&matcher, b"<><>a<>b<><>cd<><>");
    let expected = [
        (0, 2),
        (2, 4),
        (5, 7),
        (8, 10),
        (10, 12),
        (14, 16),
        (16, 18),
    ];
    for span in expected {
        assert_eq!(searcher.next(), Some(span));
    }
    // once exhausted, the searcher keeps answering `None`
    assert_eq!(searcher.next(), None);
    assert_eq!(searcher.next(), None);
}
}
// Tests for `Searcher` when it is driven by a `WhitespaceMatcher`.
#[cfg(test)]
mod whitespace_searcher_tests {
    use super::super::matcher::WhitespaceMatcher;
    use super::*;

    // Single spaces are reported as one-byte delimiters.
    #[test]
    fn test_space() {
        let matcher = WhitespaceMatcher {};
        let items: Vec<(usize, usize)> = Searcher::new(&matcher, b" . . ").collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    // Single tabs are reported as one-byte delimiters.
    #[test]
    fn test_tab() {
        let matcher = WhitespaceMatcher {};
        let items: Vec<(usize, usize)> = Searcher::new(&matcher, b"\t.\t.\t").collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    // An empty haystack yields no delimiter.
    #[test]
    fn test_empty() {
        let matcher = WhitespaceMatcher {};
        let items: Vec<(usize, usize)> = Searcher::new(&matcher, b"").collect();
        assert_eq!(vec![] as Vec<(usize, usize)>, items);
    }

    // Collects every delimiter range of `line` and compares it with `expected`.
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let matcher = WhitespaceMatcher {};
        let items: Vec<(usize, usize)> = Searcher::new(&matcher, line).collect();
        assert_eq!(expected, items);
    }

    #[test]
    fn test_multispace_normal() {
        // the first gap is two spaces wide, matching the expected range (3, 5)
        test_multispace(
            b"...  ... \t...\t ... \t ...",
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        test_multispace(b" \t\t...", &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        // a tab followed by two spaces: the expected range (3, 6) is three bytes wide
        test_multispace(b"...\t  ", &[(3, 6)]);
    }

    // Runs of mixed SPACE/TAB bytes at the start, in the middle and at the end.
    #[test]
    fn test_searcher_with_whitespace_matcher() {
        let matcher = WhitespaceMatcher {};
        let mut searcher = Searcher::new(&matcher, b"\t a b \t cd\t\t");
        for span in [(0, 2), (3, 4), (5, 8), (10, 12)] {
            assert_eq!(searcher.next(), Some(span));
        }
        // once exhausted, the searcher keeps answering `None`
        assert_eq!(searcher.next(), None);
        assert_eq!(searcher.next(), None);
    }
}

View file

@ -1,97 +0,0 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore multispace
use memchr::memchr2;
// Iterator over the whitespace delimiters of a byte slice.
pub struct WhitespaceSearcher<'a> {
    // part of the input that has not been searched yet
    haystack: &'a [u8],
    // offset of `haystack` within the slice originally handed to `new`
    position: usize,
}

impl<'a> WhitespaceSearcher<'a> {
    // Starts a search at the beginning of `haystack`.
    pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> {
        WhitespaceSearcher {
            haystack,
            position: 0,
        }
    }
}

impl<'a> Iterator for WhitespaceSearcher<'a> {
    type Item = (usize, usize);

    // Yields one item per run of consecutive SPACE and/or TAB bytes: the pair
    // `(first, last)` such that `first..last` covers the run in the original slice.
    fn next(&mut self) -> Option<Self::Item> {
        let start = memchr2(b' ', b'\t', self.haystack)?;
        // `self.haystack[start]` is whitespace itself, so the run is at least one byte long
        let run = self.haystack[start..]
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        let consumed = start + run;
        let first = self.position + start;
        // drop everything up to the end of the run so the next call resumes after it
        self.haystack = &self.haystack[consumed..];
        self.position += consumed;
        Some((first, self.position))
    }
}
// Unit tests for `WhitespaceSearcher`.
#[cfg(test)]
mod tests {
    use super::*;

    // Single spaces are reported as one-byte delimiters.
    #[test]
    fn test_space() {
        let items: Vec<(usize, usize)> = WhitespaceSearcher::new(b" . . ").collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    // Single tabs are reported as one-byte delimiters.
    #[test]
    fn test_tab() {
        let items: Vec<(usize, usize)> = WhitespaceSearcher::new(b"\t.\t.\t").collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    // An empty haystack yields no delimiter.
    #[test]
    fn test_empty() {
        let items: Vec<(usize, usize)> = WhitespaceSearcher::new(b"").collect();
        assert_eq!(vec![] as Vec<(usize, usize)>, items);
    }

    // Collects every delimiter range of `line` and compares it with `expected`.
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let items: Vec<(usize, usize)> = WhitespaceSearcher::new(line).collect();
        assert_eq!(expected, items);
    }

    #[test]
    fn test_multispace_normal() {
        // the first gap is two spaces wide, matching the expected range (3, 5)
        test_multispace(
            b"...  ... \t...\t ... \t ...",
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        test_multispace(b" \t\t...", &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        // a tab followed by two spaces: the expected range (3, 6) is three bytes wide
        test_multispace(b"...\t  ", &[(3, 6)]);
    }
}