wc: Optimize, improve correctness

- Reuse allocations for read lines
- Increase splice size
- Check if /dev/null was opened correctly
- Do not discard read bytes after I/O error
- Add fast line counting with bytecount
This commit is contained in:
Jan Verbeek 2021-08-25 11:24:00 +02:00 committed by Michael Debertol
parent c756878b20
commit 48437fc49d
6 changed files with 88 additions and 45 deletions

7
Cargo.lock generated
View file

@ -188,6 +188,12 @@ dependencies = [
"utf8-width",
]
[[package]]
name = "bytecount"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
[[package]]
name = "byteorder"
version = "1.4.3"
@ -3110,6 +3116,7 @@ dependencies = [
name = "uu_wc"
version = "0.0.7"
dependencies = [
"bytecount",
"clap",
"libc",
"nix 0.20.0",

View file

@ -19,6 +19,7 @@ clap = { version = "2.33", features = ["wrap_help"] }
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
thiserror = "1.0"
bytecount = "0.6.2"
[target.'cfg(unix)'.dependencies]
nix = "0.20"

View file

@ -1,13 +1,15 @@
use crate::word_count::WordCount;
use super::{WcResult, WordCountable};
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::fs::{File, OpenOptions};
use std::io::ErrorKind;
use std::io::{ErrorKind, Read};
#[cfg(unix)]
use libc::S_IFREG;
#[cfg(unix)]
use nix::sys::stat::fstat;
use nix::sys::stat;
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
@ -18,7 +20,8 @@ use nix::fcntl::{splice, SpliceFFlags};
#[cfg(any(target_os = "linux", target_os = "android"))]
use nix::unistd::pipe;
const BUF_SIZE: usize = 16384;
const BUF_SIZE: usize = 16 * 1024;
const SPLICE_SIZE: usize = 128 * 1024;
/// Splice wrapper which handles short writes
#[cfg(any(target_os = "linux", target_os = "android"))]
@ -37,15 +40,24 @@ fn splice_exact(read_fd: RawFd, write_fd: RawFd, num_bytes: usize) -> nix::Resul
/// This is a Linux-specific function to count the number of bytes using the
/// `splice` system call, which is faster than using `read`.
///
/// On error it returns the number of bytes it did manage to read, since the
/// caller will fall back to a simpler method.
#[inline]
#[cfg(any(target_os = "linux", target_os = "android"))]
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
fn count_bytes_using_splice(fd: RawFd) -> Result<usize, usize> {
let null_file = OpenOptions::new()
.write(true)
.open("/dev/null")
.map_err(|_| nix::Error::last())?;
.map_err(|_| 0_usize)?;
let null = null_file.as_raw_fd();
let (pipe_rd, pipe_wr) = pipe()?;
let null_rdev = stat::fstat(null).map_err(|_| 0_usize)?.st_rdev;
if (stat::major(null_rdev), stat::minor(null_rdev)) != (1, 3) {
// This is not a proper /dev/null, writing to it is probably bad
// Bit of an edge case, but it has been known to happen
return Err(0);
}
let (pipe_rd, pipe_wr) = pipe().map_err(|_| 0_usize)?;
// Ensure the pipe is closed when the function returns.
// SAFETY: The file descriptors do not have other owners.
@ -53,12 +65,16 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
let mut byte_count = 0;
loop {
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
if res == 0 {
break;
}
byte_count += res;
splice_exact(pipe_rd, null, res)?;
match splice(fd, None, pipe_wr, None, SPLICE_SIZE, SpliceFFlags::empty()) {
Ok(0) => break,
Ok(res) => {
byte_count += res;
if splice_exact(pipe_rd, null, res).is_err() {
return Err(byte_count);
}
}
Err(_) => return Err(byte_count),
};
}
Ok(byte_count)
@ -73,10 +89,12 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
/// other things such as lines and words.
#[inline]
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
let mut byte_count = 0;
#[cfg(unix)]
{
let fd = handle.as_raw_fd();
if let Ok(stat) = fstat(fd) {
if let Ok(stat) = stat::fstat(fd) {
// If the file is regular, then the `st_size` should hold
// the file's size in bytes.
if (stat.st_mode & S_IFREG) != 0 {
@ -87,8 +105,9 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
// Else, if we're on Linux and our file is a FIFO pipe
// (or stdin), we use splice to count the number of bytes.
if (stat.st_mode & S_IFIFO) != 0 {
if let Ok(n) = count_bytes_using_splice(fd) {
return Ok(n);
match count_bytes_using_splice(fd) {
Ok(n) => return Ok(n),
Err(n) => byte_count = n,
}
}
}
@ -97,7 +116,6 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
// Fall back on `read`, but without the overhead of counting words and lines.
let mut buf = [0_u8; BUF_SIZE];
let mut byte_count = 0;
loop {
match handle.read(&mut buf) {
Ok(0) => return Ok(byte_count),
@ -109,3 +127,19 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
}
}
}
/// Counts bytes and newline (`\n`) bytes from `handle` without decoding the
/// stream or splitting it into lines.
///
/// The input is consumed in chunks of at most `BUF_SIZE` bytes. Only the
/// `bytes` and `lines` fields of the returned [`WordCount`] are updated; every
/// other field keeps its `Default` value.
///
/// # Errors
///
/// A read that fails with [`ErrorKind::Interrupted`] is retried. Any other
/// I/O error is converted with `into()` and returned immediately.
pub(crate) fn count_bytes_and_lines_fast<R: Read>(handle: &mut R) -> WcResult<WordCount> {
    let mut counts = WordCount::default();
    let mut chunk = [0; BUF_SIZE];
    loop {
        // Resolve the outcome of this read first, then do the bookkeeping.
        let filled = match handle.read(&mut chunk) {
            // A zero-length read signals end of input.
            Ok(0) => break,
            Ok(len) => len,
            Err(ref err) if err.kind() == ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        };
        counts.bytes += filled;
        counts.lines += bytecount::count(&chunk[..filled], b'\n');
    }
    Ok(counts)
}

View file

@ -28,7 +28,7 @@ impl WordCountable for StdinLock<'_> {
where
Self: Sized,
{
Lines { buf: self }
Lines::new(self)
}
}
impl WordCountable for File {
@ -38,9 +38,7 @@ impl WordCountable for File {
where
Self: Sized,
{
Lines {
buf: BufReader::new(self),
}
Lines::new(BufReader::new(self))
}
}
@ -53,19 +51,25 @@ impl WordCountable for File {
/// [`io::Lines`]: io::Lines
pub struct Lines<B> {
/// The underlying reader that lines are pulled from.
buf: B,
/// Scratch buffer holding the most recently read line; it is cleared and
/// refilled on every call to `next` so the allocation is reused.
line: Vec<u8>,
}
impl<B: BufRead> Iterator for Lines<B> {
type Item = io::Result<Vec<u8>>;
impl<B: BufRead> Lines<B> {
fn new(reader: B) -> Self {
Lines {
buf: reader,
line: Vec::new(),
}
}
fn next(&mut self) -> Option<Self::Item> {
let mut line = Vec::new();
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
self.line.clear();
// Reading from a TTY seems to raise an error condition rather than returning Ok(0) like a file does,
// hence the Result wrapped in an Option here.
match self.buf.read_until(b'\n', &mut line) {
match self.buf.read_until(b'\n', &mut self.line) {
Ok(0) => None,
Ok(_n) => Some(Ok(line)),
Ok(_n) => Some(Ok(&self.line)),
Err(e) => Some(Err(e)),
}
}

View file

@ -8,10 +8,10 @@
#[macro_use]
extern crate uucore;
mod count_bytes;
mod count_fast;
mod countable;
mod word_count;
use count_bytes::count_bytes_fast;
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use countable::WordCountable;
use word_count::{TitledWordCount, WordCount};
@ -220,19 +220,20 @@ fn word_count_from_reader<T: WordCountable>(
// we do not need to decode the byte stream if we're only counting bytes/newlines
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
if !decode_chars {
return count_bytes_and_lines_fast(&mut reader);
}
// Sum the WordCount for each line. Show a warning for each line
// that results in an IO error when trying to read it.
let total = reader
.lines()
.filter_map(|res| match res {
Ok(line) => Some(line),
Err(e) => {
show_warning!("Error while reading {}: {}", path, e);
None
}
})
.map(|line| WordCount::from_line(&line, decode_chars))
.sum();
let mut lines = reader.lines();
let mut total = WordCount::default();
while let Some(res) = lines.next() {
match res {
Ok(line) => total += WordCount::from_line(line),
Err(e) => show_warning!("Error while reading {}: {}", path, e),
}
}
Ok(total)
}

View file

@ -74,15 +74,11 @@ impl WordCount {
/// fields will be set to 0. If it is `true`, this function will
/// attempt to decode the bytes first as UTF-8, and failing that,
/// as ASCII.
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
pub fn from_line(line: &[u8]) -> WordCount {
// GNU 'wc' only counts lines that end in LF as lines
let lines = (*line.last().unwrap() == LF) as usize;
let bytes = line.len();
let (words, chars) = if decode_chars {
WordCount::word_and_char_count(line)
} else {
(0, 0)
};
let (words, chars) = WordCount::word_and_char_count(line);
// -L is a GNU 'wc' extension so same behavior on LF
let max_line_length = if chars > 0 { chars - lines } else { 0 };
WordCount {