mirror of
https://github.com/uutils/coreutils
synced 2024-07-23 10:54:14 +00:00
wc: Don't read() if we only need to count number of bytes (Version 2) (#1851)
* wc: Don't read() if we only need to count number of bytes * Resolve a few code review comments * Use write macros instead of print * Fix wc tests in case only one thing is printed * wc: Fix style * wc: Use return value of first splice rather than second * wc: Make main loop more readable * wc: Don't unwrap on failed write to stdout * wc: Increment error count when stats fail to print * Re-add Cargo.lock
This commit is contained in:
parent
3e06882acc
commit
698dab12a6
2017
Cargo.lock
generated
2017
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -18,6 +18,11 @@ path = "src/wc.rs"
|
|||
clap = "2.33"
|
||||
uucore = { version=">=0.0.7", package="uucore", path="../../uucore" }
|
||||
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
||||
thiserror = "1.0"
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
nix = "0.20"
|
||||
libc = "0.2"
|
||||
|
||||
[[bin]]
|
||||
name = "wc"
|
||||
|
|
95
src/uu/wc/src/count_bytes.rs
Normal file
95
src/uu/wc/src/count_bytes.rs
Normal file
|
@ -0,0 +1,95 @@
|
|||
use super::{WcResult, WordCountable};
|
||||
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::ErrorKind;
|
||||
|
||||
#[cfg(unix)]
|
||||
use libc::S_IFREG;
|
||||
#[cfg(unix)]
|
||||
use nix::sys::stat::fstat;
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use libc::S_IFIFO;
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use nix::fcntl::{splice, SpliceFFlags};
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
use nix::unistd::pipe;
|
||||
|
||||
const BUF_SIZE: usize = 16384;
|
||||
|
||||
/// This is a Linux-specific function to count the number of bytes using the
|
||||
/// `splice` system call, which is faster than using `read`.
|
||||
#[inline]
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
||||
let null_file = OpenOptions::new()
|
||||
.write(true)
|
||||
.open("/dev/null")
|
||||
.map_err(|_| nix::Error::last())?;
|
||||
let null = null_file.as_raw_fd();
|
||||
let (pipe_rd, pipe_wr) = pipe()?;
|
||||
|
||||
let mut byte_count = 0;
|
||||
loop {
|
||||
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
|
||||
if res == 0 {
|
||||
break;
|
||||
}
|
||||
byte_count += res;
|
||||
splice(pipe_rd, None, null, None, res, SpliceFFlags::empty())?;
|
||||
}
|
||||
|
||||
Ok(byte_count)
|
||||
}
|
||||
|
||||
/// In the special case where we only need to count the number of bytes. There
|
||||
/// are several optimizations we can do:
|
||||
/// 1. On Unix, we can simply `stat` the file if it is regular.
|
||||
/// 2. On Linux -- if the above did not work -- we can use splice to count
|
||||
/// the number of bytes if the file is a FIFO.
|
||||
/// 3. Otherwise, we just read normally, but without the overhead of counting
|
||||
/// other things such as lines and words.
|
||||
#[inline]
|
||||
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
|
||||
#[cfg(unix)]
|
||||
{
|
||||
let fd = handle.as_raw_fd();
|
||||
match fstat(fd) {
|
||||
Ok(stat) => {
|
||||
// If the file is regular, then the `st_size` should hold
|
||||
// the file's size in bytes.
|
||||
if (stat.st_mode & S_IFREG) != 0 {
|
||||
return Ok(stat.st_size as usize);
|
||||
}
|
||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||
{
|
||||
// Else, if we're on Linux and our file is a FIFO pipe
|
||||
// (or stdin), we use splice to count the number of bytes.
|
||||
if (stat.st_mode & S_IFIFO) != 0 {
|
||||
if let Ok(n) = count_bytes_using_splice(fd) {
|
||||
return Ok(n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back on `read`, but without the overhead of counting words and lines.
|
||||
let mut buf = [0 as u8; BUF_SIZE];
|
||||
let mut byte_count = 0;
|
||||
loop {
|
||||
match handle.read(&mut buf) {
|
||||
Ok(0) => return Ok(byte_count),
|
||||
Ok(n) => {
|
||||
byte_count += n;
|
||||
}
|
||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -10,14 +10,31 @@
|
|||
#[macro_use]
|
||||
extern crate uucore;
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
mod count_bytes;
|
||||
use count_bytes::count_bytes_fast;
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use thiserror::Error;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, BufRead, BufReader, Read};
|
||||
use std::io::{self, BufRead, BufReader, Read, StdinLock, Write};
|
||||
use std::ops::{Add, AddAssign};
|
||||
#[cfg(unix)]
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::path::Path;
|
||||
use std::result::Result as StdResult;
|
||||
use std::str::from_utf8;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum WcError {
|
||||
#[error("{0}")]
|
||||
Io(#[from] io::Error),
|
||||
#[error("Expected a file, found directory {0}")]
|
||||
IsDirectory(String),
|
||||
}
|
||||
|
||||
type WcResult<T> = Result<T, WcError>;
|
||||
|
||||
struct Settings {
|
||||
show_bytes: bool,
|
||||
show_chars: bool,
|
||||
|
@ -53,10 +70,46 @@ impl Settings {
|
|||
show_max_line_length: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn number_enabled(&self) -> u32 {
|
||||
let mut result = 0;
|
||||
result += self.show_bytes as u32;
|
||||
result += self.show_chars as u32;
|
||||
result += self.show_lines as u32;
|
||||
result += self.show_max_line_length as u32;
|
||||
result += self.show_words as u32;
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
struct Result {
|
||||
title: String,
|
||||
#[cfg(unix)]
|
||||
trait WordCountable: AsRawFd + Read {
|
||||
type Buffered: BufRead;
|
||||
fn get_buffered(self) -> Self::Buffered;
|
||||
}
|
||||
#[cfg(not(unix))]
|
||||
trait WordCountable: Read {
|
||||
type Buffered: BufRead;
|
||||
fn get_buffered(self) -> Self::Buffered;
|
||||
}
|
||||
|
||||
impl WordCountable for StdinLock<'_> {
|
||||
type Buffered = Self;
|
||||
|
||||
fn get_buffered(self) -> Self::Buffered {
|
||||
self
|
||||
}
|
||||
}
|
||||
impl WordCountable for File {
|
||||
type Buffered = BufReader<Self>;
|
||||
|
||||
fn get_buffered(self) -> Self::Buffered {
|
||||
BufReader::new(self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Copy, Clone)]
|
||||
struct WordCount {
|
||||
bytes: usize,
|
||||
chars: usize,
|
||||
lines: usize,
|
||||
|
@ -64,6 +117,45 @@ struct Result {
|
|||
max_line_length: usize,
|
||||
}
|
||||
|
||||
impl Add for WordCount {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, other: Self) -> Self {
|
||||
Self {
|
||||
bytes: self.bytes + other.bytes,
|
||||
chars: self.chars + other.chars,
|
||||
lines: self.lines + other.lines,
|
||||
words: self.words + other.words,
|
||||
max_line_length: max(self.max_line_length, other.max_line_length),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign for WordCount {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
*self = *self + other
|
||||
}
|
||||
}
|
||||
|
||||
impl WordCount {
|
||||
fn with_title<'a>(self, title: &'a str) -> TitledWordCount<'a> {
|
||||
return TitledWordCount {
|
||||
title: title,
|
||||
count: self,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct supplements the actual word count with a title that is displayed
|
||||
/// to the user at the end of the program.
|
||||
/// The reason we don't simply include title in the `WordCount` struct is that
|
||||
/// it would result in unneccesary copying of `String`.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct TitledWordCount<'a> {
|
||||
title: &'a str,
|
||||
count: WordCount,
|
||||
}
|
||||
|
||||
static ABOUT: &str = "Display newline, word, and byte counts for each FILE, and a total line if
|
||||
more than one FILE is specified.";
|
||||
static VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
@ -137,12 +229,11 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
|
||||
let settings = Settings::new(&matches);
|
||||
|
||||
match wc(files, &settings) {
|
||||
Ok(()) => ( /* pass */ ),
|
||||
Err(e) => return e,
|
||||
if wc(files, &settings).is_ok() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
const CR: u8 = b'\r';
|
||||
|
@ -157,151 +248,187 @@ fn is_word_separator(byte: u8) -> bool {
|
|||
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
|
||||
}
|
||||
|
||||
fn wc(files: Vec<String>, settings: &Settings) -> StdResult<(), i32> {
|
||||
let mut total_line_count: usize = 0;
|
||||
let mut total_word_count: usize = 0;
|
||||
let mut total_char_count: usize = 0;
|
||||
let mut total_byte_count: usize = 0;
|
||||
let mut total_longest_line_length: usize = 0;
|
||||
|
||||
let mut results = vec![];
|
||||
let mut max_width: usize = 0;
|
||||
fn word_count_from_reader<T: WordCountable>(
|
||||
mut reader: T,
|
||||
settings: &Settings,
|
||||
path: &String,
|
||||
) -> WcResult<WordCount> {
|
||||
let only_count_bytes = settings.show_bytes
|
||||
&& (!(settings.show_chars
|
||||
|| settings.show_lines
|
||||
|| settings.show_max_line_length
|
||||
|| settings.show_words));
|
||||
if only_count_bytes {
|
||||
return Ok(WordCount {
|
||||
bytes: count_bytes_fast(&mut reader)?,
|
||||
..WordCount::default()
|
||||
});
|
||||
}
|
||||
|
||||
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
||||
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
||||
|
||||
let mut line_count: usize = 0;
|
||||
let mut word_count: usize = 0;
|
||||
let mut byte_count: usize = 0;
|
||||
let mut char_count: usize = 0;
|
||||
let mut longest_line_length: usize = 0;
|
||||
let mut raw_line = Vec::new();
|
||||
let mut ends_lf: bool;
|
||||
|
||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||
// hence the option wrapped in a result here
|
||||
let mut buffered_reader = reader.get_buffered();
|
||||
loop {
|
||||
match buffered_reader.read_until(LF, &mut raw_line) {
|
||||
Ok(n) => {
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(ref e) => {
|
||||
if !raw_line.is_empty() {
|
||||
show_warning!("Error while reading {}: {}", path, e);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
ends_lf = *raw_line.last().unwrap() == LF;
|
||||
line_count += ends_lf as usize;
|
||||
|
||||
byte_count += raw_line.len();
|
||||
|
||||
if decode_chars {
|
||||
// try and convert the bytes to UTF-8 first
|
||||
let current_char_count;
|
||||
match from_utf8(&raw_line[..]) {
|
||||
Ok(line) => {
|
||||
word_count += line.split_whitespace().count();
|
||||
current_char_count = line.chars().count();
|
||||
}
|
||||
Err(..) => {
|
||||
word_count += raw_line.split(|&x| is_word_separator(x)).count();
|
||||
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
|
||||
}
|
||||
}
|
||||
char_count += current_char_count;
|
||||
if current_char_count > longest_line_length {
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
longest_line_length = current_char_count - (ends_lf as usize);
|
||||
}
|
||||
}
|
||||
|
||||
raw_line.truncate(0);
|
||||
}
|
||||
|
||||
Ok(WordCount {
|
||||
bytes: byte_count,
|
||||
chars: char_count,
|
||||
lines: line_count,
|
||||
words: word_count,
|
||||
max_line_length: longest_line_length,
|
||||
})
|
||||
}
|
||||
|
||||
fn word_count_from_path(path: &String, settings: &Settings) -> WcResult<WordCount> {
|
||||
if path == "-" {
|
||||
let stdin = io::stdin();
|
||||
let stdin_lock = stdin.lock();
|
||||
return Ok(word_count_from_reader(stdin_lock, settings, path)?);
|
||||
} else {
|
||||
let path_obj = Path::new(path);
|
||||
if path_obj.is_dir() {
|
||||
return Err(WcError::IsDirectory(path.clone()));
|
||||
} else {
|
||||
let file = File::open(path)?;
|
||||
return Ok(word_count_from_reader(file, settings, path)?);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn wc(files: Vec<String>, settings: &Settings) -> Result<(), u32> {
|
||||
let mut total_word_count = WordCount::default();
|
||||
let mut results = vec![];
|
||||
let mut max_width: usize = 0;
|
||||
let mut error_count = 0;
|
||||
|
||||
let num_files = files.len();
|
||||
|
||||
for path in &files {
|
||||
let mut reader = open(&path[..])?;
|
||||
|
||||
let mut line_count: usize = 0;
|
||||
let mut word_count: usize = 0;
|
||||
let mut byte_count: usize = 0;
|
||||
let mut char_count: usize = 0;
|
||||
let mut longest_line_length: usize = 0;
|
||||
let mut raw_line = Vec::new();
|
||||
let mut ends_lf: bool;
|
||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||
// hence the option wrapped in a result here
|
||||
while match reader.read_until(LF, &mut raw_line) {
|
||||
Ok(n) if n > 0 => true,
|
||||
Err(ref e) if !raw_line.is_empty() => {
|
||||
show_warning!("Error while reading {}: {}", path, e);
|
||||
!raw_line.is_empty()
|
||||
}
|
||||
_ => false,
|
||||
} {
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
ends_lf = *raw_line.last().unwrap() == LF;
|
||||
line_count += ends_lf as usize;
|
||||
|
||||
byte_count += raw_line.len();
|
||||
|
||||
if decode_chars {
|
||||
// try and convert the bytes to UTF-8 first
|
||||
let current_char_count;
|
||||
match from_utf8(&raw_line[..]) {
|
||||
Ok(line) => {
|
||||
word_count += line.split_whitespace().count();
|
||||
current_char_count = line.chars().count();
|
||||
}
|
||||
Err(..) => {
|
||||
word_count += raw_line.split(|&x| is_word_separator(x)).count();
|
||||
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
|
||||
}
|
||||
}
|
||||
char_count += current_char_count;
|
||||
if current_char_count > longest_line_length {
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
longest_line_length = current_char_count - (ends_lf as usize);
|
||||
}
|
||||
}
|
||||
|
||||
raw_line.truncate(0);
|
||||
}
|
||||
|
||||
results.push(Result {
|
||||
title: path.clone(),
|
||||
bytes: byte_count,
|
||||
chars: char_count,
|
||||
lines: line_count,
|
||||
words: word_count,
|
||||
max_line_length: longest_line_length,
|
||||
let word_count = word_count_from_path(&path, settings).unwrap_or_else(|err| {
|
||||
show_error!("{}", err);
|
||||
error_count += 1;
|
||||
WordCount::default()
|
||||
});
|
||||
|
||||
total_line_count += line_count;
|
||||
max_width = max(max_width, word_count.bytes.to_string().len() + 1);
|
||||
total_word_count += word_count;
|
||||
total_char_count += char_count;
|
||||
total_byte_count += byte_count;
|
||||
|
||||
if longest_line_length > total_longest_line_length {
|
||||
total_longest_line_length = longest_line_length;
|
||||
}
|
||||
|
||||
// used for formatting
|
||||
max_width = total_byte_count.to_string().len() + 1;
|
||||
results.push(word_count.with_title(path));
|
||||
}
|
||||
|
||||
for result in &results {
|
||||
print_stats(settings, &result, max_width);
|
||||
if let Err(err) = print_stats(settings, &result, max_width) {
|
||||
show_warning!("failed to print result for {}: {}", result.title, err);
|
||||
error_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if files.len() > 1 {
|
||||
let result = Result {
|
||||
title: "total".to_owned(),
|
||||
bytes: total_byte_count,
|
||||
chars: total_char_count,
|
||||
lines: total_line_count,
|
||||
words: total_word_count,
|
||||
max_line_length: total_longest_line_length,
|
||||
};
|
||||
print_stats(settings, &result, max_width);
|
||||
if num_files > 1 {
|
||||
let total_result = total_word_count.with_title("total");
|
||||
if let Err(err) = print_stats(settings, &total_result, max_width) {
|
||||
show_warning!("failed to print total: {}", err);
|
||||
error_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if error_count == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(error_count)
|
||||
}
|
||||
}
|
||||
|
||||
fn print_stats(
|
||||
settings: &Settings,
|
||||
result: &TitledWordCount,
|
||||
mut min_width: usize,
|
||||
) -> WcResult<()> {
|
||||
let stdout = io::stdout();
|
||||
let mut stdout_lock = stdout.lock();
|
||||
|
||||
if settings.number_enabled() <= 1 {
|
||||
// Prevent a leading space in case we only need to display a single
|
||||
// number.
|
||||
min_width = 0;
|
||||
}
|
||||
|
||||
if settings.show_lines {
|
||||
write!(stdout_lock, "{:1$}", result.count.lines, min_width)?;
|
||||
}
|
||||
if settings.show_words {
|
||||
write!(stdout_lock, "{:1$}", result.count.words, min_width)?;
|
||||
}
|
||||
if settings.show_bytes {
|
||||
write!(stdout_lock, "{:1$}", result.count.bytes, min_width)?;
|
||||
}
|
||||
if settings.show_chars {
|
||||
write!(stdout_lock, "{:1$}", result.count.chars, min_width)?;
|
||||
}
|
||||
if settings.show_max_line_length {
|
||||
write!(
|
||||
stdout_lock,
|
||||
"{:1$}",
|
||||
result.count.max_line_length, min_width
|
||||
)?;
|
||||
}
|
||||
|
||||
if result.title == "-" {
|
||||
writeln!(stdout_lock, "")?;
|
||||
} else {
|
||||
writeln!(stdout_lock, " {}", result.title)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_stats(settings: &Settings, result: &Result, max_width: usize) {
|
||||
if settings.show_lines {
|
||||
print!("{:1$}", result.lines, max_width);
|
||||
}
|
||||
if settings.show_words {
|
||||
print!("{:1$}", result.words, max_width);
|
||||
}
|
||||
if settings.show_bytes {
|
||||
print!("{:1$}", result.bytes, max_width);
|
||||
}
|
||||
if settings.show_chars {
|
||||
print!("{:1$}", result.chars, max_width);
|
||||
}
|
||||
if settings.show_max_line_length {
|
||||
print!("{:1$}", result.max_line_length, max_width);
|
||||
}
|
||||
|
||||
if result.title != "-" {
|
||||
println!(" {}", result.title);
|
||||
} else {
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
fn open(path: &str) -> StdResult<BufReader<Box<dyn Read + 'static>>, i32> {
|
||||
if "-" == path {
|
||||
let reader = Box::new(stdin()) as Box<dyn Read>;
|
||||
return Ok(BufReader::new(reader));
|
||||
}
|
||||
|
||||
let fpath = Path::new(path);
|
||||
if fpath.is_dir() {
|
||||
show_info!("{}: is a directory", path);
|
||||
}
|
||||
match File::open(&fpath) {
|
||||
Ok(fd) => {
|
||||
let reader = Box::new(fd) as Box<dyn Read>;
|
||||
Ok(BufReader::new(reader))
|
||||
}
|
||||
Err(e) => {
|
||||
show_error!("wc: {}: {}", path, e);
|
||||
Err(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ fn test_stdin_line_len_regression() {
|
|||
.args(&["-L"])
|
||||
.pipe_in("\n123456")
|
||||
.run()
|
||||
.stdout_is(" 6\n");
|
||||
.stdout_is("6\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -34,7 +34,7 @@ fn test_stdin_only_bytes() {
|
|||
.args(&["-c"])
|
||||
.pipe_in_fixture("lorem_ipsum.txt")
|
||||
.run()
|
||||
.stdout_is(" 772\n");
|
||||
.stdout_is("772\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -59,7 +59,7 @@ fn test_single_only_lines() {
|
|||
new_ucmd!()
|
||||
.args(&["-l", "moby_dick.txt"])
|
||||
.run()
|
||||
.stdout_is(" 18 moby_dick.txt\n");
|
||||
.stdout_is("18 moby_dick.txt\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
Loading…
Reference in a new issue