mirror of
https://github.com/uutils/coreutils
synced 2024-09-16 06:21:12 +00:00
cut: refactor (#4255)
Refactors the `cut` field logic to reduce code duplication by factoring out a common `Searcher`, which is _templatized_ on a specific `Matcher` -- `ExactMatcher` for an explicit delimiter and `WhitespaceMatcher` for a whitespace delimiter. Before: code duplication in `Searcher` and `WhitespaceSearcher`; code duplication in `cut_fields` and `cut_fields_whitespace`. After: two versions of `Matcher`; one `Searcher`; `cut_fields` simplified by delegating the actual work to specific functions.
This commit is contained in:
parent
8c6d0e7630
commit
3ad36a49cb
|
@ -16,12 +16,12 @@ use uucore::display::Quotable;
|
|||
use uucore::error::{FromIo, UResult, USimpleError};
|
||||
|
||||
use self::searcher::Searcher;
|
||||
use self::whitespace_searcher::WhitespaceSearcher;
|
||||
use matcher::{ExactMatcher, Matcher, WhitespaceMatcher};
|
||||
use uucore::ranges::Range;
|
||||
use uucore::{format_usage, show, show_error, show_if_err};
|
||||
|
||||
mod matcher;
|
||||
mod searcher;
|
||||
mod whitespace_searcher;
|
||||
|
||||
static USAGE: &str =
|
||||
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
|
||||
|
@ -188,23 +188,22 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::cognitive_complexity)]
|
||||
fn cut_fields_delimiter<R: Read>(
|
||||
// Output delimiter is explicitly specified
|
||||
fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
|
||||
reader: R,
|
||||
matcher: &M,
|
||||
ranges: &[Range],
|
||||
delim: &str,
|
||||
only_delimited: bool,
|
||||
newline_char: u8,
|
||||
out_delim: &str,
|
||||
) -> UResult<()> {
|
||||
let mut buf_in = BufReader::new(reader);
|
||||
let mut out = stdout_writer();
|
||||
let input_delim_len = delim.len();
|
||||
|
||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||
let mut fields_pos = 1;
|
||||
let mut low_idx = 0;
|
||||
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
|
||||
let mut delim_search = Searcher::new(matcher, line).peekable();
|
||||
let mut print_delim = false;
|
||||
|
||||
if delim_search.peek().is_none() {
|
||||
|
@ -218,85 +217,6 @@ fn cut_fields_delimiter<R: Read>(
|
|||
return Ok(true);
|
||||
}
|
||||
|
||||
for &Range { low, high } in ranges {
|
||||
if low - fields_pos > 0 {
|
||||
low_idx = match delim_search.nth(low - fields_pos - 1) {
|
||||
Some(index) => index + input_delim_len,
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
for _ in 0..=high - low {
|
||||
if print_delim {
|
||||
out.write_all(out_delim.as_bytes())?;
|
||||
} else {
|
||||
print_delim = true;
|
||||
}
|
||||
|
||||
match delim_search.next() {
|
||||
Some(high_idx) => {
|
||||
let segment = &line[low_idx..high_idx];
|
||||
|
||||
out.write_all(segment)?;
|
||||
|
||||
low_idx = high_idx + input_delim_len;
|
||||
fields_pos = high + 1;
|
||||
}
|
||||
None => {
|
||||
let segment = &line[low_idx..];
|
||||
|
||||
out.write_all(segment)?;
|
||||
|
||||
if line[line.len() - 1] == newline_char {
|
||||
return Ok(true);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.write_all(&[newline_char])?;
|
||||
Ok(true)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
return Err(USimpleError::new(1, e.to_string()));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cut_fields_whitespace<R: Read>(
|
||||
reader: R,
|
||||
ranges: &[Range],
|
||||
only_delimited: bool,
|
||||
newline_char: u8,
|
||||
out_delim: &str,
|
||||
) -> UResult<()> {
|
||||
let mut buf_in = BufReader::new(reader);
|
||||
let mut out = stdout_writer();
|
||||
|
||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||
let mut fields_pos = 1;
|
||||
let mut low_idx = 0;
|
||||
let mut delim_search = WhitespaceSearcher::new(line).peekable();
|
||||
let mut print_delim = false;
|
||||
|
||||
if delim_search.peek().is_none() {
|
||||
if !only_delimited {
|
||||
out.write_all(line)?;
|
||||
if line[line.len() - 1] != newline_char {
|
||||
out.write_all(&[newline_char])?;
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(true);
|
||||
}
|
||||
// The logic is identical to `cut_fields_delimiter` function above, which uses
|
||||
// `Searcher` that iterates over and returns the first position of the delimiter character.
|
||||
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
|
||||
// delimiter character positions, since each delimiter sequence length can vary.
|
||||
for &Range { low, high } in ranges {
|
||||
if low - fields_pos > 0 {
|
||||
// current field is not in the range, so jump to the field corresponding to the
|
||||
|
@ -317,7 +237,7 @@ fn cut_fields_whitespace<R: Read>(
|
|||
}
|
||||
|
||||
match delim_search.next() {
|
||||
// print the current field up to the next whitespace
|
||||
// print the current field up to the next field delim
|
||||
Some((first, last)) => {
|
||||
let segment = &line[low_idx..first];
|
||||
|
||||
|
@ -352,40 +272,25 @@ fn cut_fields_whitespace<R: Read>(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
||||
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||
match opts.delimiter {
|
||||
Delimiter::Whitespace => cut_fields_whitespace(
|
||||
reader,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
newline_char,
|
||||
opts.out_delimiter.as_deref().unwrap_or("\t"),
|
||||
),
|
||||
Delimiter::String(ref delimiter) => {
|
||||
if let Some(ref o_delim) = opts.out_delimiter {
|
||||
return cut_fields_delimiter(
|
||||
reader,
|
||||
ranges,
|
||||
delimiter,
|
||||
opts.only_delimited,
|
||||
newline_char,
|
||||
o_delim,
|
||||
);
|
||||
}
|
||||
|
||||
// Output delimiter is the same as input delimiter
|
||||
fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
|
||||
reader: R,
|
||||
matcher: &M,
|
||||
ranges: &[Range],
|
||||
only_delimited: bool,
|
||||
newline_char: u8,
|
||||
) -> UResult<()> {
|
||||
let mut buf_in = BufReader::new(reader);
|
||||
let mut out = stdout_writer();
|
||||
let delim_len = delimiter.len();
|
||||
|
||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||
let mut fields_pos = 1;
|
||||
let mut low_idx = 0;
|
||||
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
|
||||
let mut delim_search = Searcher::new(matcher, line).peekable();
|
||||
let mut print_delim = false;
|
||||
|
||||
if delim_search.peek().is_none() {
|
||||
if !opts.only_delimited {
|
||||
if !only_delimited {
|
||||
out.write_all(line)?;
|
||||
if line[line.len() - 1] != newline_char {
|
||||
out.write_all(&[newline_char])?;
|
||||
|
@ -397,25 +302,21 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
|||
|
||||
for &Range { low, high } in ranges {
|
||||
if low - fields_pos > 0 {
|
||||
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
|
||||
low_idx = if print_delim {
|
||||
delim_pos
|
||||
} else {
|
||||
delim_pos + delim_len
|
||||
}
|
||||
if let Some((first, last)) = delim_search.nth(low - fields_pos - 1) {
|
||||
low_idx = if print_delim { first } else { last }
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
match delim_search.nth(high - low) {
|
||||
Some(high_idx) => {
|
||||
let segment = &line[low_idx..high_idx];
|
||||
Some((first, _)) => {
|
||||
let segment = &line[low_idx..first];
|
||||
|
||||
out.write_all(segment)?;
|
||||
|
||||
print_delim = true;
|
||||
low_idx = high_idx;
|
||||
low_idx = first;
|
||||
fields_pos = high + 1;
|
||||
}
|
||||
None => {
|
||||
|
@ -439,6 +340,42 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
|||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
||||
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||
match opts.delimiter {
|
||||
Delimiter::String(ref delim) => {
|
||||
let matcher = ExactMatcher::new(delim.as_bytes());
|
||||
match opts.out_delimiter {
|
||||
Some(ref out_delim) => cut_fields_explicit_out_delim(
|
||||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
newline_char,
|
||||
out_delim,
|
||||
),
|
||||
None => cut_fields_implicit_out_delim(
|
||||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
newline_char,
|
||||
),
|
||||
}
|
||||
}
|
||||
Delimiter::Whitespace => {
|
||||
let matcher = WhitespaceMatcher {};
|
||||
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
|
||||
cut_fields_explicit_out_delim(
|
||||
reader,
|
||||
&matcher,
|
||||
ranges,
|
||||
opts.only_delimited,
|
||||
newline_char,
|
||||
out_delim,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
126
src/uu/cut/src/matcher.rs
Normal file
126
src/uu/cut/src/matcher.rs
Normal file
|
@ -0,0 +1,126 @@
|
|||
// This file is part of the uutils coreutils package.
|
||||
//
|
||||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
use memchr::{memchr, memchr2};
|
||||
|
||||
/// Locates the next delimiter inside a byte slice.
///
/// A successful search yields `(first, last)` such that `haystack[first..last]`
/// is the matched pattern. Both offsets are relative to the start of the
/// `haystack` slice that was passed in, so a caller scanning a sub-slice has to
/// add its own base offset.
pub trait Matcher {
    /// Returns the `(first, last)` bounds of the next match in `haystack`, or
    /// `None` when `haystack` contains no match.
    fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)>;
}
|
||||
|
||||
/// Matches an exact byte sequence.
#[derive(Debug, Clone, Copy)]
pub struct ExactMatcher<'a> {
    /// Pattern to search for; guaranteed non-empty by [`ExactMatcher::new`].
    needle: &'a [u8],
}

impl<'a> ExactMatcher<'a> {
    /// Creates a matcher for the byte sequence `needle`.
    ///
    /// # Panics
    ///
    /// Panics if `needle` is empty.
    pub fn new(needle: &'a [u8]) -> Self {
        assert!(
            !needle.is_empty(),
            "ExactMatcher requires a non-empty needle"
        );
        Self { needle }
    }
}
|
||||
|
||||
impl<'a> Matcher for ExactMatcher<'a> {
|
||||
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
|
||||
let mut pos = 0usize;
|
||||
loop {
|
||||
match memchr(self.needle[0], &haystack[pos..]) {
|
||||
Some(match_idx) => {
|
||||
let match_idx = match_idx + pos; // account for starting from pos
|
||||
if self.needle.len() == 1
|
||||
|| haystack[match_idx + 1..].starts_with(&self.needle[1..])
|
||||
{
|
||||
return Some((match_idx, match_idx + self.needle.len()));
|
||||
} else {
|
||||
pos = match_idx + 1;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Matches for any number of SPACE or TAB
|
||||
pub struct WhitespaceMatcher {}
|
||||
|
||||
impl Matcher for WhitespaceMatcher {
|
||||
fn next_match(&self, haystack: &[u8]) -> Option<(usize, usize)> {
|
||||
match memchr2(b' ', b'\t', haystack) {
|
||||
Some(match_idx) => {
|
||||
let mut skip = match_idx + 1;
|
||||
while skip < haystack.len() {
|
||||
match haystack[skip] {
|
||||
b' ' | b'\t' => skip += 1,
|
||||
_ => break,
|
||||
}
|
||||
}
|
||||
Some((match_idx, skip))
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod matcher_tests {

    use super::*;

    #[test]
    fn test_exact_matcher_single_byte() {
        let matcher = ExactMatcher::new(b":");
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b":"), Some((0, 1)));
        assert_eq!(matcher.next_match(b":abcxyz"), Some((0, 1)));
        assert_eq!(matcher.next_match(b"abc:xyz"), Some((3, 4)));
        assert_eq!(matcher.next_match(b"abcxyz:"), Some((6, 7)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    #[test]
    fn test_exact_matcher_multi_bytes() {
        let matcher = ExactMatcher::new(b"<>");
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b"<>"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"<>abcxyz"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"abc<>xyz"), Some((3, 5)));
        assert_eq!(matcher.next_match(b"abcxyz<>"), Some((6, 8)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    #[test]
    fn test_exact_matcher_partial_match() {
        let matcher = ExactMatcher::new(b"<>");
        // A lone first byte must be skipped and the search resumed right after it
        assert_eq!(matcher.next_match(b"<a<>"), Some((2, 4)));
        assert_eq!(matcher.next_match(b"<<>"), Some((1, 3)));
        // A needle prefix at the very end of the haystack is not a match
        assert_eq!(matcher.next_match(b"abc<"), None);
    }

    #[test]
    fn test_whitespace_matcher_single_space() {
        let matcher = WhitespaceMatcher {};
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b" "), Some((0, 1)));
        assert_eq!(matcher.next_match(b"\tabcxyz"), Some((0, 1)));
        assert_eq!(matcher.next_match(b"abc\txyz"), Some((3, 4)));
        assert_eq!(matcher.next_match(b"abcxyz "), Some((6, 7)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }

    #[test]
    fn test_whitespace_matcher_multi_spaces() {
        let matcher = WhitespaceMatcher {};
        // spell-checker:disable
        assert_eq!(matcher.next_match(b""), None);
        assert_eq!(matcher.next_match(b" \t "), Some((0, 3)));
        assert_eq!(matcher.next_match(b"\t\tabcxyz"), Some((0, 2)));
        assert_eq!(matcher.next_match(b"abc \txyz"), Some((3, 5)));
        // NOTE: two trailing spaces, so the run covers bytes 6 and 7
        assert_eq!(matcher.next_match(b"abcxyz  "), Some((6, 8)));
        assert_eq!(matcher.next_match(b"abcxyz"), None);
        // spell-checker:enable
    }
}
|
|
@ -5,82 +5,77 @@
|
|||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
use memchr::memchr;
|
||||
// spell-checker:ignore multispace
|
||||
|
||||
pub struct Searcher<'a> {
|
||||
haystack: &'a [u8],
|
||||
needle: &'a [u8],
|
||||
use super::matcher::Matcher;
|
||||
|
||||
// Generic searcher that relies on a specific matcher
|
||||
pub struct Searcher<'a, 'b, M: Matcher> {
|
||||
matcher: &'a M,
|
||||
haystack: &'b [u8],
|
||||
position: usize,
|
||||
}
|
||||
|
||||
impl<'a> Searcher<'a> {
|
||||
pub fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> {
|
||||
assert!(!needle.is_empty());
|
||||
Searcher {
|
||||
impl<'a, 'b, M: Matcher> Searcher<'a, 'b, M> {
|
||||
pub fn new(matcher: &'a M, haystack: &'b [u8]) -> Self {
|
||||
Self {
|
||||
matcher,
|
||||
haystack,
|
||||
needle,
|
||||
position: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Searcher<'a> {
|
||||
type Item = usize;
|
||||
// Iterate over field delimiters
|
||||
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
|
||||
// corresponds to the delimiter.
|
||||
impl<'a, 'b, M: Matcher> Iterator for Searcher<'a, 'b, M> {
|
||||
type Item = (usize, usize);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some(match_idx) = memchr(self.needle[0], self.haystack) {
|
||||
if self.needle.len() == 1
|
||||
|| self.haystack[match_idx + 1..].starts_with(&self.needle[1..])
|
||||
{
|
||||
let match_pos = self.position + match_idx;
|
||||
let skip = match_idx + self.needle.len();
|
||||
self.haystack = &self.haystack[skip..];
|
||||
self.position += skip;
|
||||
return Some(match_pos);
|
||||
} else {
|
||||
let skip = match_idx + 1;
|
||||
self.haystack = &self.haystack[skip..];
|
||||
self.position += skip;
|
||||
// continue
|
||||
}
|
||||
} else {
|
||||
return None;
|
||||
match self.matcher.next_match(&self.haystack[self.position..]) {
|
||||
Some((first, last)) => {
|
||||
let result = (first + self.position, last + self.position);
|
||||
self.position += last;
|
||||
Some(result)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
mod exact_searcher_tests {
|
||||
|
||||
use super::super::matcher::ExactMatcher;
|
||||
use super::*;
|
||||
|
||||
const NEEDLE: &[u8] = "ab".as_bytes();
|
||||
|
||||
#[test]
|
||||
fn test_normal() {
|
||||
let iter = Searcher::new("a.a.a".as_bytes(), "a".as_bytes());
|
||||
let items: Vec<usize> = iter.collect();
|
||||
assert_eq!(vec![0, 2, 4], items);
|
||||
let matcher = ExactMatcher::new("a".as_bytes());
|
||||
let iter = Searcher::new(&matcher, "a.a.a".as_bytes());
|
||||
let items: Vec<(usize, usize)> = iter.collect();
|
||||
assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty() {
|
||||
let iter = Searcher::new("".as_bytes(), "a".as_bytes());
|
||||
let items: Vec<usize> = iter.collect();
|
||||
assert_eq!(vec![] as Vec<usize>, items);
|
||||
let matcher = ExactMatcher::new("a".as_bytes());
|
||||
let iter = Searcher::new(&matcher, "".as_bytes());
|
||||
let items: Vec<(usize, usize)> = iter.collect();
|
||||
assert_eq!(vec![] as Vec<(usize, usize)>, items);
|
||||
}
|
||||
|
||||
fn test_multibyte(line: &[u8], expected: &[usize]) {
|
||||
let iter = Searcher::new(line, NEEDLE);
|
||||
let items: Vec<usize> = iter.collect();
|
||||
fn test_multibyte(line: &[u8], expected: &[(usize, usize)]) {
|
||||
let matcher = ExactMatcher::new("ab".as_bytes());
|
||||
let iter = Searcher::new(&matcher, line);
|
||||
let items: Vec<(usize, usize)> = iter.collect();
|
||||
assert_eq!(expected, items);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_normal() {
|
||||
test_multibyte("...ab...ab...".as_bytes(), &[3, 8]);
|
||||
test_multibyte("...ab...ab...".as_bytes(), &[(3, 5), (8, 10)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -90,16 +85,101 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_multibyte_starting_needle() {
|
||||
test_multibyte("ab...ab...".as_bytes(), &[0, 5]);
|
||||
test_multibyte("ab...ab...".as_bytes(), &[(0, 2), (5, 7)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_trailing_needle() {
|
||||
test_multibyte("...ab...ab".as_bytes(), &[3, 8]);
|
||||
test_multibyte("...ab...ab".as_bytes(), &[(3, 5), (8, 10)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_first_byte_false_match() {
|
||||
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[10]);
|
||||
test_multibyte("aA..aCaC..ab..aD".as_bytes(), &[(10, 12)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher_with_exact_matcher() {
|
||||
let matcher = ExactMatcher::new("<>".as_bytes());
|
||||
let haystack = "<><>a<>b<><>cd<><>".as_bytes();
|
||||
let mut searcher = Searcher::new(&matcher, haystack);
|
||||
assert_eq!(searcher.next(), Some((0, 2)));
|
||||
assert_eq!(searcher.next(), Some((2, 4)));
|
||||
assert_eq!(searcher.next(), Some((5, 7)));
|
||||
assert_eq!(searcher.next(), Some((8, 10)));
|
||||
assert_eq!(searcher.next(), Some((10, 12)));
|
||||
assert_eq!(searcher.next(), Some((14, 16)));
|
||||
assert_eq!(searcher.next(), Some((16, 18)));
|
||||
assert_eq!(searcher.next(), None);
|
||||
assert_eq!(searcher.next(), None);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod whitespace_searcher_tests {

    use super::super::matcher::WhitespaceMatcher;
    use super::*;

    #[test]
    fn test_space() {
        let matcher = WhitespaceMatcher {};
        let iter = Searcher::new(&matcher, b" . . ");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    #[test]
    fn test_tab() {
        let matcher = WhitespaceMatcher {};
        let iter = Searcher::new(&matcher, b"\t.\t.\t");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    #[test]
    fn test_empty() {
        let matcher = WhitespaceMatcher {};
        let iter = Searcher::new(&matcher, b"");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(Vec::<(usize, usize)>::new(), items);
    }

    // Collects every delimiter run found in `line` and compares it with `expected`
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let matcher = WhitespaceMatcher {};
        let iter = Searcher::new(&matcher, line);
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(expected, items);
    }

    #[test]
    fn test_multispace_normal() {
        // NOTE: the first gap is two spaces wide, which is what makes it (3, 5)
        test_multispace(
            b"...  ... \t...\t ... \t ...",
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        test_multispace(b" \t\t...", &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        // NOTE: a TAB followed by two spaces, i.e. a three-byte run
        test_multispace(b"...\t  ", &[(3, 6)]);
    }

    #[test]
    fn test_searcher_with_whitespace_matcher() {
        let matcher = WhitespaceMatcher {};
        let haystack = b"\t a b \t cd\t\t";
        let mut searcher = Searcher::new(&matcher, haystack);
        assert_eq!(searcher.next(), Some((0, 2)));
        assert_eq!(searcher.next(), Some((3, 4)));
        assert_eq!(searcher.next(), Some((5, 8)));
        assert_eq!(searcher.next(), Some((10, 12)));
        // The iterator keeps returning `None` once exhausted
        assert_eq!(searcher.next(), None);
        assert_eq!(searcher.next(), None);
    }
}
|
||||
|
|
|
@ -1,97 +0,0 @@
|
|||
// This file is part of the uutils coreutils package.
|
||||
//
|
||||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore multispace
|
||||
|
||||
use memchr::memchr2;
|
||||
|
||||
// Iterator state for walking the whitespace delimiters of a byte slice
pub struct WhitespaceSearcher<'a> {
    // Part of the input that has not been scanned yet
    haystack: &'a [u8],
    // Number of input bytes already consumed, i.e. the offset of `haystack`
    position: usize,
}

impl<'a> WhitespaceSearcher<'a> {
    // Start a search at the beginning of `haystack`
    pub fn new(haystack: &'a [u8]) -> Self {
        let position = 0;
        Self { haystack, position }
    }
}
|
||||
|
||||
impl<'a> Iterator for WhitespaceSearcher<'a> {
|
||||
type Item = (usize, usize);
|
||||
|
||||
// Iterate over sequences of consecutive whitespace (space and/or tab) characters.
|
||||
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
|
||||
// corresponds to the delimiter.
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) {
|
||||
let mut skip = match_idx + 1;
|
||||
while skip < self.haystack.len()
|
||||
&& (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t')
|
||||
{
|
||||
skip += 1;
|
||||
}
|
||||
let match_pos = self.position + match_idx;
|
||||
self.haystack = &self.haystack[skip..];
|
||||
self.position += skip;
|
||||
Some((match_pos, self.position))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn test_space() {
        let iter = WhitespaceSearcher::new(b" . . ");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    #[test]
    fn test_tab() {
        let iter = WhitespaceSearcher::new(b"\t.\t.\t");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    #[test]
    fn test_empty() {
        let iter = WhitespaceSearcher::new(b"");
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(Vec::<(usize, usize)>::new(), items);
    }

    // Collects every whitespace run found in `line` and compares it with `expected`
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let iter = WhitespaceSearcher::new(line);
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(expected, items);
    }

    #[test]
    fn test_multispace_normal() {
        // NOTE: the first gap is two spaces wide, which is what makes it (3, 5)
        test_multispace(
            b"...  ... \t...\t ... \t ...",
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        test_multispace(b" \t\t...", &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        // NOTE: a TAB followed by two spaces, i.e. a three-byte run
        test_multispace(b"...\t  ", &[(3, 6)]);
    }
}
|
Loading…
Reference in a new issue