mirror of
https://github.com/uutils/coreutils
synced 2024-07-23 19:04:18 +00:00
wc: Optimize, improve correctness
- Reuse allocations for read lines
- Increase splice size
- Check if /dev/null was opened correctly
- Do not discard read bytes after I/O error
- Add fast line counting with bytecount
This commit is contained in:
parent
c756878b20
commit
48437fc49d
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -188,6 +188,12 @@ dependencies = [
|
|||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
|
@ -3110,6 +3116,7 @@ dependencies = [
|
|||
name = "uu_wc"
|
||||
version = "0.0.7"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"clap",
|
||||
"libc",
|
||||
"nix 0.20.0",
|
||||
|
|
|
@ -19,6 +19,7 @@ clap = { version = "2.33", features = ["wrap_help"] }
|
|||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||
thiserror = "1.0"
|
||||
bytecount = "0.6.2"
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
nix = "0.20"
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
use crate::word_count::WordCount;
|
||||
|
||||
use super::{WcResult, WordCountable};
|
||||
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::ErrorKind;
|
||||
use std::io::{ErrorKind, Read};
|
||||
|
||||
#[cfg(unix)]
|
||||
use libc::S_IFREG;
|
||||
#[cfg(unix)]
|
||||
use nix::sys::stat::fstat;
|
||||
use nix::sys::stat;
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
|
||||
|
@ -18,7 +20,8 @@ use nix::fcntl::{splice, SpliceFFlags};
|
|||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use nix::unistd::pipe;
|
||||
|
||||
const BUF_SIZE: usize = 16384;
|
||||
const BUF_SIZE: usize = 16 * 1024;
|
||||
const SPLICE_SIZE: usize = 128 * 1024;
|
||||
|
||||
/// Splice wrapper which handles short writes
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
|
@ -37,15 +40,24 @@ fn splice_exact(read_fd: RawFd, write_fd: RawFd, num_bytes: usize) -> nix::Resul
|
|||
|
||||
/// This is a Linux-specific function to count the number of bytes using the
|
||||
/// `splice` system call, which is faster than using `read`.
|
||||
///
|
||||
/// On error it returns the number of bytes it did manage to read, since the
|
||||
/// caller will fall back to a simpler method.
|
||||
#[inline]
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
||||
fn count_bytes_using_splice(fd: RawFd) -> Result<usize, usize> {
|
||||
let null_file = OpenOptions::new()
|
||||
.write(true)
|
||||
.open("/dev/null")
|
||||
.map_err(|_| nix::Error::last())?;
|
||||
.map_err(|_| 0_usize)?;
|
||||
let null = null_file.as_raw_fd();
|
||||
let (pipe_rd, pipe_wr) = pipe()?;
|
||||
let null_rdev = stat::fstat(null).map_err(|_| 0_usize)?.st_rdev;
|
||||
if (stat::major(null_rdev), stat::minor(null_rdev)) != (1, 3) {
|
||||
// This is not a proper /dev/null, writing to it is probably bad
|
||||
// Bit of an edge case, but it has been known to happen
|
||||
return Err(0);
|
||||
}
|
||||
let (pipe_rd, pipe_wr) = pipe().map_err(|_| 0_usize)?;
|
||||
|
||||
// Ensure the pipe is closed when the function returns.
|
||||
// SAFETY: The file descriptors do not have other owners.
|
||||
|
@ -53,12 +65,16 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
|||
|
||||
let mut byte_count = 0;
|
||||
loop {
|
||||
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
|
||||
if res == 0 {
|
||||
break;
|
||||
}
|
||||
byte_count += res;
|
||||
splice_exact(pipe_rd, null, res)?;
|
||||
match splice(fd, None, pipe_wr, None, SPLICE_SIZE, SpliceFFlags::empty()) {
|
||||
Ok(0) => break,
|
||||
Ok(res) => {
|
||||
byte_count += res;
|
||||
if splice_exact(pipe_rd, null, res).is_err() {
|
||||
return Err(byte_count);
|
||||
}
|
||||
}
|
||||
Err(_) => return Err(byte_count),
|
||||
};
|
||||
}
|
||||
|
||||
Ok(byte_count)
|
||||
|
@ -73,10 +89,12 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
|||
/// other things such as lines and words.
|
||||
#[inline]
|
||||
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
|
||||
let mut byte_count = 0;
|
||||
|
||||
#[cfg(unix)]
|
||||
{
|
||||
let fd = handle.as_raw_fd();
|
||||
if let Ok(stat) = fstat(fd) {
|
||||
if let Ok(stat) = stat::fstat(fd) {
|
||||
// If the file is regular, then the `st_size` should hold
|
||||
// the file's size in bytes.
|
||||
if (stat.st_mode & S_IFREG) != 0 {
|
||||
|
@ -87,8 +105,9 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
|||
// Else, if we're on Linux and our file is a FIFO pipe
|
||||
// (or stdin), we use splice to count the number of bytes.
|
||||
if (stat.st_mode & S_IFIFO) != 0 {
|
||||
if let Ok(n) = count_bytes_using_splice(fd) {
|
||||
return Ok(n);
|
||||
match count_bytes_using_splice(fd) {
|
||||
Ok(n) => return Ok(n),
|
||||
Err(n) => byte_count = n,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -97,7 +116,6 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
|||
|
||||
// Fall back on `read`, but without the overhead of counting words and lines.
|
||||
let mut buf = [0_u8; BUF_SIZE];
|
||||
let mut byte_count = 0;
|
||||
loop {
|
||||
match handle.read(&mut buf) {
|
||||
Ok(0) => return Ok(byte_count),
|
||||
|
@ -109,3 +127,19 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn count_bytes_and_lines_fast<R: Read>(handle: &mut R) -> WcResult<WordCount> {
|
||||
let mut total = WordCount::default();
|
||||
let mut buf = [0; BUF_SIZE];
|
||||
loop {
|
||||
match handle.read(&mut buf) {
|
||||
Ok(0) => return Ok(total),
|
||||
Ok(n) => {
|
||||
total.bytes += n;
|
||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||
}
|
||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -28,7 +28,7 @@ impl WordCountable for StdinLock<'_> {
|
|||
where
|
||||
Self: Sized,
|
||||
{
|
||||
Lines { buf: self }
|
||||
Lines::new(self)
|
||||
}
|
||||
}
|
||||
impl WordCountable for File {
|
||||
|
@ -38,9 +38,7 @@ impl WordCountable for File {
|
|||
where
|
||||
Self: Sized,
|
||||
{
|
||||
Lines {
|
||||
buf: BufReader::new(self),
|
||||
}
|
||||
Lines::new(BufReader::new(self))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -53,19 +51,25 @@ impl WordCountable for File {
|
|||
/// [`io::Lines`]:: io::Lines
|
||||
pub struct Lines<B> {
|
||||
buf: B,
|
||||
line: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<B: BufRead> Iterator for Lines<B> {
|
||||
type Item = io::Result<Vec<u8>>;
|
||||
impl<B: BufRead> Lines<B> {
|
||||
fn new(reader: B) -> Self {
|
||||
Lines {
|
||||
buf: reader,
|
||||
line: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut line = Vec::new();
|
||||
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
|
||||
self.line.clear();
|
||||
|
||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||
// hence the option wrapped in a result here
|
||||
match self.buf.read_until(b'\n', &mut line) {
|
||||
match self.buf.read_until(b'\n', &mut self.line) {
|
||||
Ok(0) => None,
|
||||
Ok(_n) => Some(Ok(line)),
|
||||
Ok(_n) => Some(Ok(&self.line)),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,10 +8,10 @@
|
|||
#[macro_use]
|
||||
extern crate uucore;
|
||||
|
||||
mod count_bytes;
|
||||
mod count_fast;
|
||||
mod countable;
|
||||
mod word_count;
|
||||
use count_bytes::count_bytes_fast;
|
||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
||||
use countable::WordCountable;
|
||||
use word_count::{TitledWordCount, WordCount};
|
||||
|
||||
|
@ -220,19 +220,20 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
||||
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
||||
|
||||
if !decode_chars {
|
||||
return count_bytes_and_lines_fast(&mut reader);
|
||||
}
|
||||
|
||||
// Sum the WordCount for each line. Show a warning for each line
|
||||
// that results in an IO error when trying to read it.
|
||||
let total = reader
|
||||
.lines()
|
||||
.filter_map(|res| match res {
|
||||
Ok(line) => Some(line),
|
||||
Err(e) => {
|
||||
show_warning!("Error while reading {}: {}", path, e);
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(|line| WordCount::from_line(&line, decode_chars))
|
||||
.sum();
|
||||
let mut lines = reader.lines();
|
||||
let mut total = WordCount::default();
|
||||
while let Some(res) = lines.next() {
|
||||
match res {
|
||||
Ok(line) => total += WordCount::from_line(line),
|
||||
Err(e) => show_warning!("Error while reading {}: {}", path, e),
|
||||
}
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
|
|
|
@ -74,15 +74,11 @@ impl WordCount {
|
|||
/// fields will be set to 0. If it is `true`, this function will
|
||||
/// attempt to decode the bytes first as UTF-8, and failing that,
|
||||
/// as ASCII.
|
||||
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
|
||||
pub fn from_line(line: &[u8]) -> WordCount {
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
let lines = (*line.last().unwrap() == LF) as usize;
|
||||
let bytes = line.len();
|
||||
let (words, chars) = if decode_chars {
|
||||
WordCount::word_and_char_count(line)
|
||||
} else {
|
||||
(0, 0)
|
||||
};
|
||||
let (words, chars) = WordCount::word_and_char_count(line);
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
||||
WordCount {
|
||||
|
|
Loading…
Reference in a new issue