cut: optimizations

* Use buffered stdout to reduce write sys calls.

This simple change yielded the biggest performace gain.

* Use `for_byte_record_with_terminator` from the `bstr` crate.

This is to minimize the per line copying needed by
`BufReader::read_until`. The `cut_fields` and `cut_fields_delimiter`
functions used `read_until` to iterate over lines. That required copying
each input line to the line buffer. With
`for_byte_record_with_terminator` copying is minimized as it calls our
closure with a reference to BufReader's buffer most of the time.  It
needs to copy (internally) only to process any incomplete lines at the
end of the buffer.

* Re-write `Searcher` to use `memchr`.

Switch from the naive implementation to one that uses `memchr`.

* Rewrite `cut_bytes` almost entirely.

This was already well optimized. The performance gain in this case is
not from avoiding copying. In fact, it needed zero copying whereas new
implementation introduces some copying similar to `cut_fields` described
above. But the occassional copying cost is more than offset by the use
of the very fast `memchr` inside `for_byte_record_with_terminator`.
This change also simplifies the code significantly. Removed the `buffer`
module.
This commit is contained in:
Chirag Jadwani 2021-04-24 21:34:42 +05:30
parent 2f17bfc14c
commit 2c1459cbfc
5 changed files with 157 additions and 302 deletions

2
Cargo.lock generated
View file

@ -1777,7 +1777,9 @@ dependencies = [
name = "uu_cut"
version = "0.0.6"
dependencies = [
"bstr",
"clap",
"memchr 2.3.4",
"uucore",
"uucore_procs",
]

View file

@ -18,6 +18,8 @@ path = "src/cut.rs"
clap = "2.33"
uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
memchr = "2"
bstr = "0.2"
[[bin]]
name = "cut"

View file

@ -1,152 +0,0 @@
// This file is part of the uutils coreutils package.
//
// (c) Rolf Morel <rolfmorel@gmail.com>
// (c) kwantam <kwantam@gmail.com>
// * substantial rewrite to use the `std::io::BufReader` trait
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (ToDO) SRes Newl
use std::io::Result as IoResult;
use std::io::{BufRead, BufReader, Read, Write};
#[allow(non_snake_case)]
pub mod Bytes {
use std::io::Write;
pub trait Select {
fn select<W: Write>(&mut self, bytes: usize, out: Option<&mut W>) -> Selected;
}
#[derive(PartialEq, Eq, Debug)]
pub enum Selected {
NewlineFound,
Complete(usize),
Partial(usize),
EndOfFile,
}
}
#[derive(Debug)]
pub struct ByteReader<R>
where
R: Read,
{
inner: BufReader<R>,
newline_char: u8,
}
impl<R: Read> ByteReader<R> {
pub fn new(read: R, newline_char: u8) -> ByteReader<R> {
ByteReader {
inner: BufReader::with_capacity(4096, read),
newline_char,
}
}
}
impl<R: Read> Read for ByteReader<R> {
fn read(&mut self, buf: &mut [u8]) -> IoResult<usize> {
self.inner.read(buf)
}
}
impl<R: Read> BufRead for ByteReader<R> {
fn fill_buf(&mut self) -> IoResult<&[u8]> {
self.inner.fill_buf()
}
fn consume(&mut self, amt: usize) {
self.inner.consume(amt)
}
}
impl<R: Read> ByteReader<R> {
pub fn consume_line(&mut self) -> usize {
let mut bytes_consumed = 0;
let mut consume_val;
let newline_char = self.newline_char;
loop {
{
// need filled_buf to go out of scope
let filled_buf = match self.fill_buf() {
Ok(b) => {
if b.is_empty() {
return bytes_consumed;
} else {
b
}
}
Err(e) => crash!(1, "read error: {}", e),
};
if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) {
consume_val = idx + 1;
bytes_consumed += consume_val;
break;
}
consume_val = filled_buf.len();
}
bytes_consumed += consume_val;
self.consume(consume_val);
}
self.consume(consume_val);
bytes_consumed
}
}
impl<R: Read> self::Bytes::Select for ByteReader<R> {
fn select<W: Write>(&mut self, bytes: usize, out: Option<&mut W>) -> Bytes::Selected {
enum SRes {
Comp,
Part,
Newl,
}
use self::Bytes::Selected::*;
let newline_char = self.newline_char;
let (res, consume_val) = {
let buffer = match self.fill_buf() {
Err(e) => crash!(1, "read error: {}", e),
Ok(b) => b,
};
let (res, consume_val) = match buffer.len() {
0 => return EndOfFile,
buf_used if bytes < buf_used => {
// because the output delimiter should only be placed between
// segments check if the byte after bytes is a newline
let buf_slice = &buffer[0..=bytes];
match buf_slice.iter().position(|byte| *byte == newline_char) {
Some(idx) => (SRes::Newl, idx + 1),
None => (SRes::Comp, bytes),
}
}
_ => match buffer.iter().position(|byte| *byte == newline_char) {
Some(idx) => (SRes::Newl, idx + 1),
None => (SRes::Part, buffer.len()),
},
};
if let Some(out) = out {
crash_if_err!(1, out.write_all(&buffer[0..consume_val]));
}
(res, consume_val)
};
self.consume(consume_val);
match res {
SRes::Comp => Complete(consume_val),
SRes::Part => Partial(consume_val),
SRes::Newl => NewlineFound,
}
}
}

View file

@ -10,15 +10,16 @@
#[macro_use]
extern crate uucore;
use bstr::io::BufReadExt;
use clap::{App, Arg};
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, Read, Stdout, Write};
use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write};
use std::path::Path;
use self::searcher::Searcher;
use uucore::fs::is_stdout_interactive;
use uucore::ranges::Range;
mod buffer;
mod searcher;
static NAME: &str = "cut";
@ -125,6 +126,14 @@ enum Mode {
Fields(Vec<Range>, FieldOptions),
}
fn stdout_writer() -> Box<dyn Write> {
if is_stdout_interactive() {
Box::new(stdout())
} else {
Box::new(BufWriter::new(stdout())) as Box<dyn Write>
}
}
fn list_to_ranges(list: &str, complement: bool) -> Result<Vec<Range>, String> {
if complement {
Range::from_list(list).map(|r| uucore::ranges::complement(&r))
@ -134,72 +143,35 @@ fn list_to_ranges(list: &str, complement: bool) -> Result<Vec<Range>, String> {
}
fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
use self::buffer::Bytes::Select;
use self::buffer::Bytes::Selected::*;
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
let mut buf_read = buffer::ByteReader::new(reader, newline_char);
let mut out = stdout();
let buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim = opts
.out_delim
.as_ref()
.map_or("", String::as_str)
.as_bytes();
'newline: loop {
let mut cur_pos = 1;
let res = buf_in.for_byte_record(newline_char, |line| {
let mut print_delim = false;
for &Range { low, high } in ranges.iter() {
// skip up to low
let orig_pos = cur_pos;
loop {
match buf_read.select(low - cur_pos, None::<&mut Stdout>) {
NewlineFound => {
crash_if_err!(1, out.write_all(&[newline_char]));
continue 'newline;
}
Complete(len) => {
cur_pos += len;
break;
}
Partial(len) => cur_pos += len,
EndOfFile => {
if orig_pos != cur_pos {
crash_if_err!(1, out.write_all(&[newline_char]));
}
break 'newline;
}
}
for &Range { low, high } in ranges {
if low > line.len() {
break;
}
if let Some(ref delim) = opts.out_delim {
if print_delim {
crash_if_err!(1, out.write_all(delim.as_bytes()));
}
if print_delim {
out.write_all(delim)?;
} else if opts.out_delim.is_some() {
print_delim = true;
}
// write out from low to high
loop {
match buf_read.select(high - cur_pos + 1, Some(&mut out)) {
NewlineFound => continue 'newline,
Partial(len) => cur_pos += len,
Complete(_) => {
cur_pos = high + 1;
break;
}
EndOfFile => {
if cur_pos != low || low == high {
crash_if_err!(1, out.write_all(&[newline_char]));
}
break 'newline;
}
}
}
// change `low` from 1-indexed value to 0-index value
let low = low - 1;
let high = high.min(line.len());
out.write_all(&line[low..high])?;
}
buf_read.consume_line();
crash_if_err!(1, out.write_all(&[newline_char]));
}
out.write_all(&[newline_char])?;
Ok(true)
});
crash_if_err!(1, res);
0
}
@ -212,23 +184,11 @@ fn cut_fields_delimiter<R: Read>(
newline_char: u8,
out_delim: &str,
) -> i32 {
let mut buf_in = BufReader::new(reader);
let mut out = stdout();
let mut buffer = Vec::new();
let buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let input_delim_len = delim.len();
'newline: loop {
buffer.clear();
match buf_in.read_until(newline_char, &mut buffer) {
Ok(n) if n == 0 => break,
Err(e) => {
if buffer.is_empty() {
crash!(1, "read error: {}", e);
}
}
_ => (),
}
let line = &buffer[..];
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
@ -236,46 +196,46 @@ fn cut_fields_delimiter<R: Read>(
if delim_search.peek().is_none() {
if !only_delimited {
crash_if_err!(1, out.write_all(line));
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
crash_if_err!(1, out.write_all(&[newline_char]));
out.write_all(&[newline_char])?;
}
}
continue;
return Ok(true);
}
for &Range { low, high } in ranges.iter() {
if low - fields_pos > 0 {
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some((_, beyond_delim)) => beyond_delim,
Some(index) => index + input_delim_len,
None => break,
};
}
for _ in 0..=high - low {
if print_delim {
crash_if_err!(1, out.write_all(out_delim.as_bytes()));
out.write_all(out_delim.as_bytes())?;
} else {
print_delim = true;
}
match delim_search.next() {
Some((high_idx, next_low_idx)) => {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
crash_if_err!(1, out.write_all(segment));
out.write_all(segment)?;
print_delim = true;
low_idx = next_low_idx;
low_idx = high_idx + input_delim_len;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..];
crash_if_err!(1, out.write_all(segment));
out.write_all(segment)?;
if line[line.len() - 1] == newline_char {
continue 'newline;
return Ok(true);
}
break;
}
@ -283,9 +243,10 @@ fn cut_fields_delimiter<R: Read>(
}
}
crash_if_err!(1, out.write_all(&[newline_char]));
}
out.write_all(&[newline_char])?;
Ok(true)
});
crash_if_err!(1, result);
0
}
@ -303,23 +264,11 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
);
}
let mut buf_in = BufReader::new(reader);
let mut out = stdout();
let mut buffer = Vec::new();
let buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim_len = opts.delimiter.len();
'newline: loop {
buffer.clear();
match buf_in.read_until(newline_char, &mut buffer) {
Ok(n) if n == 0 => break,
Err(e) => {
if buffer.is_empty() {
crash!(1, "read error: {}", e);
}
}
_ => (),
}
let line = &buffer[..];
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable();
@ -327,53 +276,54 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
if delim_search.peek().is_none() {
if !opts.only_delimited {
crash_if_err!(1, out.write_all(line));
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
crash_if_err!(1, out.write_all(&[newline_char]));
out.write_all(&[newline_char])?;
}
}
continue;
return Ok(true);
}
for &Range { low, high } in ranges.iter() {
for &Range { low, high } in ranges {
if low - fields_pos > 0 {
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some((_, beyond_delim)) => beyond_delim,
None => break,
};
}
if print_delim && low_idx >= opts.delimiter.as_bytes().len() {
low_idx -= opts.delimiter.as_bytes().len();
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim {
delim_pos
} else {
delim_pos + delim_len
}
} else {
break;
}
}
match delim_search.nth(high - low) {
Some((high_idx, next_low_idx)) => {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
crash_if_err!(1, out.write_all(segment));
out.write_all(segment)?;
print_delim = true;
low_idx = next_low_idx;
low_idx = high_idx;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];
crash_if_err!(1, out.write_all(segment));
out.write_all(segment)?;
if line[line.len() - 1] == newline_char {
continue 'newline;
return Ok(true);
}
break;
}
}
}
crash_if_err!(1, out.write_all(&[newline_char]));
}
out.write_all(&[newline_char])?;
Ok(true)
});
crash_if_err!(1, result);
0
}

View file

@ -5,7 +5,8 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
#[derive(Clone)]
use memchr::memchr;
pub struct Searcher<'a> {
haystack: &'a [u8],
needle: &'a [u8],
@ -14,6 +15,7 @@ pub struct Searcher<'a> {
impl<'a> Searcher<'a> {
pub fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> {
assert!(!needle.is_empty());
Searcher {
haystack,
needle,
@ -23,30 +25,81 @@ impl<'a> Searcher<'a> {
}
impl<'a> Iterator for Searcher<'a> {
type Item = (usize, usize);
type Item = usize;
fn next(&mut self) -> Option<(usize, usize)> {
if self.needle.len() == 1 {
for offset in self.position..self.haystack.len() {
if self.haystack[offset] == self.needle[0] {
self.position = offset + 1;
return Some((offset, offset + 1));
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(match_idx) = memchr(self.needle[0], self.haystack) {
if self.needle.len() == 1
|| self.haystack[match_idx + 1..].starts_with(&self.needle[1..])
{
let skip = match_idx + self.needle.len();
self.haystack = &self.haystack[skip..];
let match_pos = self.position + match_idx;
self.position += skip;
return Some(match_pos);
} else {
let skip = match_idx + 1;
self.haystack = &self.haystack[skip..];
self.position += skip;
// continue
}
}
self.position = self.haystack.len();
return None;
}
while self.position + self.needle.len() <= self.haystack.len() {
if &self.haystack[self.position..self.position + self.needle.len()] == self.needle {
let match_pos = self.position;
self.position += self.needle.len();
return Some((match_pos, match_pos + self.needle.len()));
} else {
self.position += 1;
return None;
}
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
const NEEDLE: &[u8] = "ab".as_bytes();
#[test]
fn test_normal() {
let iter = Searcher::new("a.a.a".as_bytes(), "a".as_bytes());
let items: Vec<usize> = iter.collect();
assert_eq!(vec![0, 2, 4], items);
}
#[test]
fn test_empty() {
let iter = Searcher::new("".as_bytes(), "a".as_bytes());
let items: Vec<usize> = iter.collect();
assert_eq!(vec![] as Vec<usize>, items);
}
fn test_multibyte(line: &[u8], expected: Vec<usize>) {
let iter = Searcher::new(line, NEEDLE);
let items: Vec<usize> = iter.collect();
assert_eq!(expected, items);
}
#[test]
fn test_multibyte_normal() {
test_multibyte("...ab...ab...".as_bytes(), vec![3, 8]);
}
#[test]
fn test_multibyte_needle_head_at_end() {
test_multibyte("a".as_bytes(), vec![]);
}
#[test]
fn test_multibyte_starting_needle() {
test_multibyte("ab...ab...".as_bytes(), vec![0, 5]);
}
#[test]
fn test_multibyte_trailing_needle() {
test_multibyte("...ab...ab".as_bytes(), vec![3, 8]);
}
#[test]
fn test_multibyte_first_byte_false_match() {
test_multibyte("aA..aCaC..ab..aD".as_bytes(), vec![10]);
}
}