wc: Don't read() if we only need to count number of bytes (Version 2) (#1851)

* wc: Don't read() if we only need to count number of bytes

* Resolve a few code review comments

* Use write macros instead of print

* Fix wc tests in case only one thing is printed

* wc: Fix style

* wc: Use return value of first splice rather than second

* wc: Make main loop more readable

* wc: Don't unwrap on failed write to stdout

* wc: Increment error count when stats fail to print

* Re-add Cargo.lock
This commit is contained in:
Árni Dagur 2021-03-30 18:53:02 +00:00 committed by GitHub
parent 3e06882acc
commit 698dab12a6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1382 additions and 1152 deletions

2017
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -18,6 +18,11 @@ path = "src/wc.rs"
clap = "2.33"
uucore = { version=">=0.0.7", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
thiserror = "1.0"
[target.'cfg(unix)'.dependencies]
nix = "0.20"
libc = "0.2"
[[bin]]
name = "wc"

View file

@ -0,0 +1,95 @@
use super::{WcResult, WordCountable};
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::fs::OpenOptions;
use std::io::ErrorKind;
#[cfg(unix)]
use libc::S_IFREG;
#[cfg(unix)]
use nix::sys::stat::fstat;
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::os::unix::io::{AsRawFd, RawFd};
#[cfg(any(target_os = "linux", target_os = "android"))]
use libc::S_IFIFO;
#[cfg(any(target_os = "linux", target_os = "android"))]
use nix::fcntl::{splice, SpliceFFlags};
#[cfg(any(target_os = "linux", target_os = "android"))]
use nix::unistd::pipe;
const BUF_SIZE: usize = 16384;
/// This is a Linux-specific function to count the number of bytes using the
/// `splice` system call, which is faster than using `read`.
#[inline]
#[cfg(any(target_os = "linux", target_os = "android"))]
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
let null_file = OpenOptions::new()
.write(true)
.open("/dev/null")
.map_err(|_| nix::Error::last())?;
let null = null_file.as_raw_fd();
let (pipe_rd, pipe_wr) = pipe()?;
let mut byte_count = 0;
loop {
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
if res == 0 {
break;
}
byte_count += res;
splice(pipe_rd, None, null, None, res, SpliceFFlags::empty())?;
}
Ok(byte_count)
}
/// In the special case where we only need to count the number of bytes. There
/// are several optimizations we can do:
/// 1. On Unix, we can simply `stat` the file if it is regular.
/// 2. On Linux -- if the above did not work -- we can use splice to count
/// the number of bytes if the file is a FIFO.
/// 3. Otherwise, we just read normally, but without the overhead of counting
/// other things such as lines and words.
#[inline]
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
#[cfg(unix)]
{
let fd = handle.as_raw_fd();
match fstat(fd) {
Ok(stat) => {
// If the file is regular, then the `st_size` should hold
// the file's size in bytes.
if (stat.st_mode & S_IFREG) != 0 {
return Ok(stat.st_size as usize);
}
#[cfg(any(target_os = "linux", target_os = "android"))]
{
// Else, if we're on Linux and our file is a FIFO pipe
// (or stdin), we use splice to count the number of bytes.
if (stat.st_mode & S_IFIFO) != 0 {
if let Ok(n) = count_bytes_using_splice(fd) {
return Ok(n);
}
}
}
}
_ => {}
}
}
// Fall back on `read`, but without the overhead of counting words and lines.
let mut buf = [0 as u8; BUF_SIZE];
let mut byte_count = 0;
loop {
match handle.read(&mut buf) {
Ok(0) => return Ok(byte_count),
Ok(n) => {
byte_count += n;
}
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
Err(e) => return Err(e.into()),
}
}
}

View file

@ -10,14 +10,31 @@
#[macro_use]
extern crate uucore;
use clap::{App, Arg, ArgMatches};
mod count_bytes;
use count_bytes::count_bytes_fast;
use clap::{App, Arg, ArgMatches};
use thiserror::Error;
use std::cmp::max;
use std::fs::File;
use std::io::{stdin, BufRead, BufReader, Read};
use std::io::{self, BufRead, BufReader, Read, StdinLock, Write};
use std::ops::{Add, AddAssign};
#[cfg(unix)]
use std::os::unix::io::AsRawFd;
use std::path::Path;
use std::result::Result as StdResult;
use std::str::from_utf8;
#[derive(Error, Debug)]
pub enum WcError {
#[error("{0}")]
Io(#[from] io::Error),
#[error("Expected a file, found directory {0}")]
IsDirectory(String),
}
type WcResult<T> = Result<T, WcError>;
struct Settings {
show_bytes: bool,
show_chars: bool,
@ -53,10 +70,46 @@ impl Settings {
show_max_line_length: false,
}
}
fn number_enabled(&self) -> u32 {
let mut result = 0;
result += self.show_bytes as u32;
result += self.show_chars as u32;
result += self.show_lines as u32;
result += self.show_max_line_length as u32;
result += self.show_words as u32;
result
}
}
struct Result {
title: String,
#[cfg(unix)]
trait WordCountable: AsRawFd + Read {
type Buffered: BufRead;
fn get_buffered(self) -> Self::Buffered;
}
#[cfg(not(unix))]
trait WordCountable: Read {
type Buffered: BufRead;
fn get_buffered(self) -> Self::Buffered;
}
impl WordCountable for StdinLock<'_> {
type Buffered = Self;
fn get_buffered(self) -> Self::Buffered {
self
}
}
impl WordCountable for File {
type Buffered = BufReader<Self>;
fn get_buffered(self) -> Self::Buffered {
BufReader::new(self)
}
}
#[derive(Debug, Default, Copy, Clone)]
struct WordCount {
bytes: usize,
chars: usize,
lines: usize,
@ -64,6 +117,45 @@ struct Result {
max_line_length: usize,
}
impl Add for WordCount {
type Output = Self;
fn add(self, other: Self) -> Self {
Self {
bytes: self.bytes + other.bytes,
chars: self.chars + other.chars,
lines: self.lines + other.lines,
words: self.words + other.words,
max_line_length: max(self.max_line_length, other.max_line_length),
}
}
}
impl AddAssign for WordCount {
fn add_assign(&mut self, other: Self) {
*self = *self + other
}
}
impl WordCount {
fn with_title<'a>(self, title: &'a str) -> TitledWordCount<'a> {
return TitledWordCount {
title: title,
count: self,
};
}
}
/// This struct supplements the actual word count with a title that is displayed
/// to the user at the end of the program.
/// The reason we don't simply include title in the `WordCount` struct is that
/// it would result in unneccesary copying of `String`.
#[derive(Debug, Default, Clone)]
struct TitledWordCount<'a> {
title: &'a str,
count: WordCount,
}
static ABOUT: &str = "Display newline, word, and byte counts for each FILE, and a total line if
more than one FILE is specified.";
static VERSION: &str = env!("CARGO_PKG_VERSION");
@ -137,12 +229,11 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
let settings = Settings::new(&matches);
match wc(files, &settings) {
Ok(()) => ( /* pass */ ),
Err(e) => return e,
if wc(files, &settings).is_ok() {
0
} else {
1
}
0
}
const CR: u8 = b'\r';
@ -157,151 +248,187 @@ fn is_word_separator(byte: u8) -> bool {
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
}
fn wc(files: Vec<String>, settings: &Settings) -> StdResult<(), i32> {
let mut total_line_count: usize = 0;
let mut total_word_count: usize = 0;
let mut total_char_count: usize = 0;
let mut total_byte_count: usize = 0;
let mut total_longest_line_length: usize = 0;
let mut results = vec![];
let mut max_width: usize = 0;
fn word_count_from_reader<T: WordCountable>(
mut reader: T,
settings: &Settings,
path: &String,
) -> WcResult<WordCount> {
let only_count_bytes = settings.show_bytes
&& (!(settings.show_chars
|| settings.show_lines
|| settings.show_max_line_length
|| settings.show_words));
if only_count_bytes {
return Ok(WordCount {
bytes: count_bytes_fast(&mut reader)?,
..WordCount::default()
});
}
// we do not need to decode the byte stream if we're only counting bytes/newlines
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
let mut line_count: usize = 0;
let mut word_count: usize = 0;
let mut byte_count: usize = 0;
let mut char_count: usize = 0;
let mut longest_line_length: usize = 0;
let mut raw_line = Vec::new();
let mut ends_lf: bool;
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
// hence the option wrapped in a result here
let mut buffered_reader = reader.get_buffered();
loop {
match buffered_reader.read_until(LF, &mut raw_line) {
Ok(n) => {
if n == 0 {
break;
}
}
Err(ref e) => {
if !raw_line.is_empty() {
show_warning!("Error while reading {}: {}", path, e);
} else {
break;
}
}
};
// GNU 'wc' only counts lines that end in LF as lines
ends_lf = *raw_line.last().unwrap() == LF;
line_count += ends_lf as usize;
byte_count += raw_line.len();
if decode_chars {
// try and convert the bytes to UTF-8 first
let current_char_count;
match from_utf8(&raw_line[..]) {
Ok(line) => {
word_count += line.split_whitespace().count();
current_char_count = line.chars().count();
}
Err(..) => {
word_count += raw_line.split(|&x| is_word_separator(x)).count();
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
}
}
char_count += current_char_count;
if current_char_count > longest_line_length {
// -L is a GNU 'wc' extension so same behavior on LF
longest_line_length = current_char_count - (ends_lf as usize);
}
}
raw_line.truncate(0);
}
Ok(WordCount {
bytes: byte_count,
chars: char_count,
lines: line_count,
words: word_count,
max_line_length: longest_line_length,
})
}
fn word_count_from_path(path: &String, settings: &Settings) -> WcResult<WordCount> {
if path == "-" {
let stdin = io::stdin();
let stdin_lock = stdin.lock();
return Ok(word_count_from_reader(stdin_lock, settings, path)?);
} else {
let path_obj = Path::new(path);
if path_obj.is_dir() {
return Err(WcError::IsDirectory(path.clone()));
} else {
let file = File::open(path)?;
return Ok(word_count_from_reader(file, settings, path)?);
}
}
}
fn wc(files: Vec<String>, settings: &Settings) -> Result<(), u32> {
let mut total_word_count = WordCount::default();
let mut results = vec![];
let mut max_width: usize = 0;
let mut error_count = 0;
let num_files = files.len();
for path in &files {
let mut reader = open(&path[..])?;
let mut line_count: usize = 0;
let mut word_count: usize = 0;
let mut byte_count: usize = 0;
let mut char_count: usize = 0;
let mut longest_line_length: usize = 0;
let mut raw_line = Vec::new();
let mut ends_lf: bool;
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
// hence the option wrapped in a result here
while match reader.read_until(LF, &mut raw_line) {
Ok(n) if n > 0 => true,
Err(ref e) if !raw_line.is_empty() => {
show_warning!("Error while reading {}: {}", path, e);
!raw_line.is_empty()
}
_ => false,
} {
// GNU 'wc' only counts lines that end in LF as lines
ends_lf = *raw_line.last().unwrap() == LF;
line_count += ends_lf as usize;
byte_count += raw_line.len();
if decode_chars {
// try and convert the bytes to UTF-8 first
let current_char_count;
match from_utf8(&raw_line[..]) {
Ok(line) => {
word_count += line.split_whitespace().count();
current_char_count = line.chars().count();
}
Err(..) => {
word_count += raw_line.split(|&x| is_word_separator(x)).count();
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
}
}
char_count += current_char_count;
if current_char_count > longest_line_length {
// -L is a GNU 'wc' extension so same behavior on LF
longest_line_length = current_char_count - (ends_lf as usize);
}
}
raw_line.truncate(0);
}
results.push(Result {
title: path.clone(),
bytes: byte_count,
chars: char_count,
lines: line_count,
words: word_count,
max_line_length: longest_line_length,
let word_count = word_count_from_path(&path, settings).unwrap_or_else(|err| {
show_error!("{}", err);
error_count += 1;
WordCount::default()
});
total_line_count += line_count;
max_width = max(max_width, word_count.bytes.to_string().len() + 1);
total_word_count += word_count;
total_char_count += char_count;
total_byte_count += byte_count;
if longest_line_length > total_longest_line_length {
total_longest_line_length = longest_line_length;
}
// used for formatting
max_width = total_byte_count.to_string().len() + 1;
results.push(word_count.with_title(path));
}
for result in &results {
print_stats(settings, &result, max_width);
if let Err(err) = print_stats(settings, &result, max_width) {
show_warning!("failed to print result for {}: {}", result.title, err);
error_count += 1;
}
}
if files.len() > 1 {
let result = Result {
title: "total".to_owned(),
bytes: total_byte_count,
chars: total_char_count,
lines: total_line_count,
words: total_word_count,
max_line_length: total_longest_line_length,
};
print_stats(settings, &result, max_width);
if num_files > 1 {
let total_result = total_word_count.with_title("total");
if let Err(err) = print_stats(settings, &total_result, max_width) {
show_warning!("failed to print total: {}", err);
error_count += 1;
}
}
if error_count == 0 {
Ok(())
} else {
Err(error_count)
}
}
fn print_stats(
settings: &Settings,
result: &TitledWordCount,
mut min_width: usize,
) -> WcResult<()> {
let stdout = io::stdout();
let mut stdout_lock = stdout.lock();
if settings.number_enabled() <= 1 {
// Prevent a leading space in case we only need to display a single
// number.
min_width = 0;
}
if settings.show_lines {
write!(stdout_lock, "{:1$}", result.count.lines, min_width)?;
}
if settings.show_words {
write!(stdout_lock, "{:1$}", result.count.words, min_width)?;
}
if settings.show_bytes {
write!(stdout_lock, "{:1$}", result.count.bytes, min_width)?;
}
if settings.show_chars {
write!(stdout_lock, "{:1$}", result.count.chars, min_width)?;
}
if settings.show_max_line_length {
write!(
stdout_lock,
"{:1$}",
result.count.max_line_length, min_width
)?;
}
if result.title == "-" {
writeln!(stdout_lock, "")?;
} else {
writeln!(stdout_lock, " {}", result.title)?;
}
Ok(())
}
fn print_stats(settings: &Settings, result: &Result, max_width: usize) {
if settings.show_lines {
print!("{:1$}", result.lines, max_width);
}
if settings.show_words {
print!("{:1$}", result.words, max_width);
}
if settings.show_bytes {
print!("{:1$}", result.bytes, max_width);
}
if settings.show_chars {
print!("{:1$}", result.chars, max_width);
}
if settings.show_max_line_length {
print!("{:1$}", result.max_line_length, max_width);
}
if result.title != "-" {
println!(" {}", result.title);
} else {
println!();
}
}
fn open(path: &str) -> StdResult<BufReader<Box<dyn Read + 'static>>, i32> {
if "-" == path {
let reader = Box::new(stdin()) as Box<dyn Read>;
return Ok(BufReader::new(reader));
}
let fpath = Path::new(path);
if fpath.is_dir() {
show_info!("{}: is a directory", path);
}
match File::open(&fpath) {
Ok(fd) => {
let reader = Box::new(fd) as Box<dyn Read>;
Ok(BufReader::new(reader))
}
Err(e) => {
show_error!("wc: {}: {}", path, e);
Err(1)
}
}
}

View file

@ -25,7 +25,7 @@ fn test_stdin_line_len_regression() {
.args(&["-L"])
.pipe_in("\n123456")
.run()
.stdout_is(" 6\n");
.stdout_is("6\n");
}
#[test]
@ -34,7 +34,7 @@ fn test_stdin_only_bytes() {
.args(&["-c"])
.pipe_in_fixture("lorem_ipsum.txt")
.run()
.stdout_is(" 772\n");
.stdout_is("772\n");
}
#[test]
@ -59,7 +59,7 @@ fn test_single_only_lines() {
new_ucmd!()
.args(&["-l", "moby_dick.txt"])
.run()
.stdout_is(" 18 moby_dick.txt\n");
.stdout_is("18 moby_dick.txt\n");
}
#[test]