wc: Don't read() if we only need to count number of bytes (Version 2) (#1851)

* wc: Don't read() if we only need to count number of bytes * Resolve a few code review comments * Use write macros instead of print * Fix wc tests in case only one thing is printed * wc: Fix style * wc: Use return value of first splice rather than second * wc: Make main loop more readable * wc: Don't unwrap on failed write to stdout * wc: Increment error count when stats fail to print * Re-add Cargo.lock
2024-07-23 10:54:14 +00:00 · 2021-03-30 18:53:02 +00:00 · 2021-03-30 18:53:02 +00:00 · 698dab12a6
parent 3e06882acc
commit 698dab12a6
5 changed files with 1382 additions and 1152 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/src/uu/wc/Cargo.toml
+++ b/src/uu/wc/Cargo.toml
@ -18,6 +18,11 @@ path = "src/wc.rs"
 clap = "2.33"
 uucore = { version=">=0.0.7", package="uucore", path="../../uucore" }
 uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
+thiserror = "1.0"
+
+[target.'cfg(unix)'.dependencies]
+nix = "0.20"
+libc = "0.2"

 [[bin]]
 name = "wc"
--- a/src/uu/wc/src/count_bytes.rs
+++ b/src/uu/wc/src/count_bytes.rs
@ -0,0 +1,95 @@
+use super::{WcResult, WordCountable};
+
+#[cfg(any(target_os = "linux", target_os = "android"))]
+use std::fs::OpenOptions;
+use std::io::ErrorKind;
+
+#[cfg(unix)]
+use libc::S_IFREG;
+#[cfg(unix)]
+use nix::sys::stat::fstat;
+#[cfg(any(target_os = "linux", target_os = "android"))]
+use std::os::unix::io::{AsRawFd, RawFd};
+
+#[cfg(any(target_os = "linux", target_os = "android"))]
+use libc::S_IFIFO;
+#[cfg(any(target_os = "linux", target_os = "android"))]
+use nix::fcntl::{splice, SpliceFFlags};
+#[cfg(any(target_os = "linux", target_os = "android"))]
+use nix::unistd::pipe;
+
+const BUF_SIZE: usize = 16384;
+
+/// This is a Linux-specific function to count the number of bytes using the
+/// `splice` system call, which is faster than using `read`.
+#[inline]
+#[cfg(any(target_os = "linux", target_os = "android"))]
+fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
+    let null_file = OpenOptions::new()
+        .write(true)
+        .open("/dev/null")
+        .map_err(|_| nix::Error::last())?;
+    let null = null_file.as_raw_fd();
+    let (pipe_rd, pipe_wr) = pipe()?;
+
+    let mut byte_count = 0;
+    loop {
+        let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
+        if res == 0 {
+            break;
+        }
+        byte_count += res;
+        splice(pipe_rd, None, null, None, res, SpliceFFlags::empty())?;
+    }
+
+    Ok(byte_count)
+}
+
+/// In the special case where we only need to count the number of bytes. There
+/// are several optimizations we can do:
+///   1. On Unix,  we can simply `stat` the file if it is regular.
+///   2. On Linux -- if the above did not work -- we can use splice to count
+///      the number of bytes if the file is a FIFO.
+///   3. Otherwise, we just read normally, but without the overhead of counting
+///      other things such as lines and words.
+#[inline]
+pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
+    #[cfg(unix)]
+    {
+        let fd = handle.as_raw_fd();
+        match fstat(fd) {
+            Ok(stat) => {
+                // If the file is regular, then the `st_size` should hold
+                // the file's size in bytes.
+                if (stat.st_mode & S_IFREG) != 0 {
+                    return Ok(stat.st_size as usize);
+                }
+                #[cfg(any(target_os = "linux", target_os = "android"))]
+                {
+                    // Else, if we're on Linux and our file is a FIFO pipe
+                    // (or stdin), we use splice to count the number of bytes.
+                    if (stat.st_mode & S_IFIFO) != 0 {
+                        if let Ok(n) = count_bytes_using_splice(fd) {
+                            return Ok(n);
+                        }
+                    }
+                }
+            }
+            _ => {}
+        }
+    }
+
+    // Fall back on `read`, but without the overhead of counting words and lines.
+    let mut buf = [0 as u8; BUF_SIZE];
+    let mut byte_count = 0;
+    loop {
+        match handle.read(&mut buf) {
+            Ok(0) => return Ok(byte_count),
+            Ok(n) => {
+                byte_count += n;
+            }
+            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e.into()),
+        }
+    }
+}
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@ -10,14 +10,31 @@
 #[macro_use]
 extern crate uucore;

-use clap::{App, Arg, ArgMatches};
+mod count_bytes;
+use count_bytes::count_bytes_fast;

+use clap::{App, Arg, ArgMatches};
+use thiserror::Error;
+
+use std::cmp::max;
 use std::fs::File;
-use std::io::{stdin, BufRead, BufReader, Read};
+use std::io::{self, BufRead, BufReader, Read, StdinLock, Write};
+use std::ops::{Add, AddAssign};
+#[cfg(unix)]
+use std::os::unix::io::AsRawFd;
 use std::path::Path;
-use std::result::Result as StdResult;
 use std::str::from_utf8;

+#[derive(Error, Debug)]
+pub enum WcError {
+    #[error("{0}")]
+    Io(#[from] io::Error),
+    #[error("Expected a file, found directory {0}")]
+    IsDirectory(String),
+}
+
+type WcResult<T> = Result<T, WcError>;
+
 struct Settings {
    show_bytes: bool,
    show_chars: bool,
@ -53,10 +70,46 @@ impl Settings {
            show_max_line_length: false,
        }
    }
+
+    fn number_enabled(&self) -> u32 {
+        let mut result = 0;
+        result += self.show_bytes as u32;
+        result += self.show_chars as u32;
+        result += self.show_lines as u32;
+        result += self.show_max_line_length as u32;
+        result += self.show_words as u32;
+        result
+    }
 }

-struct Result {
-    title: String,
+#[cfg(unix)]
+trait WordCountable: AsRawFd + Read {
+    type Buffered: BufRead;
+    fn get_buffered(self) -> Self::Buffered;
+}
+#[cfg(not(unix))]
+trait WordCountable: Read {
+    type Buffered: BufRead;
+    fn get_buffered(self) -> Self::Buffered;
+}
+
+impl WordCountable for StdinLock<'_> {
+    type Buffered = Self;
+
+    fn get_buffered(self) -> Self::Buffered {
+        self
+    }
+}
+impl WordCountable for File {
+    type Buffered = BufReader<Self>;
+
+    fn get_buffered(self) -> Self::Buffered {
+        BufReader::new(self)
+    }
+}
+
+#[derive(Debug, Default, Copy, Clone)]
+struct WordCount {
    bytes: usize,
    chars: usize,
    lines: usize,
@ -64,6 +117,45 @@ struct Result {
    max_line_length: usize,
 }

+impl Add for WordCount {
+    type Output = Self;
+
+    fn add(self, other: Self) -> Self {
+        Self {
+            bytes: self.bytes + other.bytes,
+            chars: self.chars + other.chars,
+            lines: self.lines + other.lines,
+            words: self.words + other.words,
+            max_line_length: max(self.max_line_length, other.max_line_length),
+        }
+    }
+}
+
+impl AddAssign for WordCount {
+    fn add_assign(&mut self, other: Self) {
+        *self = *self + other
+    }
+}
+
+impl WordCount {
+    fn with_title<'a>(self, title: &'a str) -> TitledWordCount<'a> {
+        return TitledWordCount {
+            title: title,
+            count: self,
+        };
+    }
+}
+
+/// This struct supplements the actual word count with a title that is displayed
+/// to the user at the end of the program.
+/// The reason we don't simply include title in the `WordCount` struct is that
+/// it would result in unneccesary copying of `String`.
+#[derive(Debug, Default, Clone)]
+struct TitledWordCount<'a> {
+    title: &'a str,
+    count: WordCount,
+}
+
 static ABOUT: &str = "Display newline, word, and byte counts for each FILE, and a total line if
 more than one FILE is specified.";
 static VERSION: &str = env!("CARGO_PKG_VERSION");
@ -137,12 +229,11 @@ pub fn uumain(args: impl uucore::Args) -> i32 {

    let settings = Settings::new(&matches);

-    match wc(files, &settings) {
-        Ok(()) => ( /* pass */ ),
-        Err(e) => return e,
+    if wc(files, &settings).is_ok() {
+        0
+    } else {
+        1
    }
-
-    0
 }

 const CR: u8 = b'\r';
@ -157,151 +248,187 @@ fn is_word_separator(byte: u8) -> bool {
    byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
 }

-fn wc(files: Vec<String>, settings: &Settings) -> StdResult<(), i32> {
-    let mut total_line_count: usize = 0;
-    let mut total_word_count: usize = 0;
-    let mut total_char_count: usize = 0;
-    let mut total_byte_count: usize = 0;
-    let mut total_longest_line_length: usize = 0;
-
-    let mut results = vec![];
-    let mut max_width: usize = 0;
+fn word_count_from_reader<T: WordCountable>(
+    mut reader: T,
+    settings: &Settings,
+    path: &String,
+) -> WcResult<WordCount> {
+    let only_count_bytes = settings.show_bytes
+        && (!(settings.show_chars
+            || settings.show_lines
+            || settings.show_max_line_length
+            || settings.show_words));
+    if only_count_bytes {
+        return Ok(WordCount {
+            bytes: count_bytes_fast(&mut reader)?,
+            ..WordCount::default()
+        });
+    }

    // we do not need to decode the byte stream if we're only counting bytes/newlines
    let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;

+    let mut line_count: usize = 0;
+    let mut word_count: usize = 0;
+    let mut byte_count: usize = 0;
+    let mut char_count: usize = 0;
+    let mut longest_line_length: usize = 0;
+    let mut raw_line = Vec::new();
+    let mut ends_lf: bool;
+
+    // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
+    // hence the option wrapped in a result here
+    let mut buffered_reader = reader.get_buffered();
+    loop {
+        match buffered_reader.read_until(LF, &mut raw_line) {
+            Ok(n) => {
+                if n == 0 {
+                    break;
+                }
+            }
+            Err(ref e) => {
+                if !raw_line.is_empty() {
+                    show_warning!("Error while reading {}: {}", path, e);
+                } else {
+                    break;
+                }
+            }
+        };
+
+        // GNU 'wc' only counts lines that end in LF as lines
+        ends_lf = *raw_line.last().unwrap() == LF;
+        line_count += ends_lf as usize;
+
+        byte_count += raw_line.len();
+
+        if decode_chars {
+            // try and convert the bytes to UTF-8 first
+            let current_char_count;
+            match from_utf8(&raw_line[..]) {
+                Ok(line) => {
+                    word_count += line.split_whitespace().count();
+                    current_char_count = line.chars().count();
+                }
+                Err(..) => {
+                    word_count += raw_line.split(|&x| is_word_separator(x)).count();
+                    current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
+                }
+            }
+            char_count += current_char_count;
+            if current_char_count > longest_line_length {
+                // -L is a GNU 'wc' extension so same behavior on LF
+                longest_line_length = current_char_count - (ends_lf as usize);
+            }
+        }
+
+        raw_line.truncate(0);
+    }
+
+    Ok(WordCount {
+        bytes: byte_count,
+        chars: char_count,
+        lines: line_count,
+        words: word_count,
+        max_line_length: longest_line_length,
+    })
+}
+
+fn word_count_from_path(path: &String, settings: &Settings) -> WcResult<WordCount> {
+    if path == "-" {
+        let stdin = io::stdin();
+        let stdin_lock = stdin.lock();
+        return Ok(word_count_from_reader(stdin_lock, settings, path)?);
+    } else {
+        let path_obj = Path::new(path);
+        if path_obj.is_dir() {
+            return Err(WcError::IsDirectory(path.clone()));
+        } else {
+            let file = File::open(path)?;
+            return Ok(word_count_from_reader(file, settings, path)?);
+        }
+    }
+}
+
+fn wc(files: Vec<String>, settings: &Settings) -> Result<(), u32> {
+    let mut total_word_count = WordCount::default();
+    let mut results = vec![];
+    let mut max_width: usize = 0;
+    let mut error_count = 0;
+
+    let num_files = files.len();
+
    for path in &files {
-        let mut reader = open(&path[..])?;
-
-        let mut line_count: usize = 0;
-        let mut word_count: usize = 0;
-        let mut byte_count: usize = 0;
-        let mut char_count: usize = 0;
-        let mut longest_line_length: usize = 0;
-        let mut raw_line = Vec::new();
-        let mut ends_lf: bool;
-        // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
-        // hence the option wrapped in a result here
-        while match reader.read_until(LF, &mut raw_line) {
-            Ok(n) if n > 0 => true,
-            Err(ref e) if !raw_line.is_empty() => {
-                show_warning!("Error while reading {}: {}", path, e);
-                !raw_line.is_empty()
-            }
-            _ => false,
-        } {
-            // GNU 'wc' only counts lines that end in LF as lines
-            ends_lf = *raw_line.last().unwrap() == LF;
-            line_count += ends_lf as usize;
-
-            byte_count += raw_line.len();
-
-            if decode_chars {
-                // try and convert the bytes to UTF-8 first
-                let current_char_count;
-                match from_utf8(&raw_line[..]) {
-                    Ok(line) => {
-                        word_count += line.split_whitespace().count();
-                        current_char_count = line.chars().count();
-                    }
-                    Err(..) => {
-                        word_count += raw_line.split(|&x| is_word_separator(x)).count();
-                        current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
-                    }
-                }
-                char_count += current_char_count;
-                if current_char_count > longest_line_length {
-                    // -L is a GNU 'wc' extension so same behavior on LF
-                    longest_line_length = current_char_count - (ends_lf as usize);
-                }
-            }
-
-            raw_line.truncate(0);
-        }
-
-        results.push(Result {
-            title: path.clone(),
-            bytes: byte_count,
-            chars: char_count,
-            lines: line_count,
-            words: word_count,
-            max_line_length: longest_line_length,
+        let word_count = word_count_from_path(&path, settings).unwrap_or_else(|err| {
+            show_error!("{}", err);
+            error_count += 1;
+            WordCount::default()
        });
-
-        total_line_count += line_count;
+        max_width = max(max_width, word_count.bytes.to_string().len() + 1);
        total_word_count += word_count;
-        total_char_count += char_count;
-        total_byte_count += byte_count;
-
-        if longest_line_length > total_longest_line_length {
-            total_longest_line_length = longest_line_length;
-        }
-
-        // used for formatting
-        max_width = total_byte_count.to_string().len() + 1;
+        results.push(word_count.with_title(path));
    }

    for result in &results {
-        print_stats(settings, &result, max_width);
+        if let Err(err) = print_stats(settings, &result, max_width) {
+            show_warning!("failed to print result for {}: {}", result.title, err);
+            error_count += 1;
+        }
    }

-    if files.len() > 1 {
-        let result = Result {
-            title: "total".to_owned(),
-            bytes: total_byte_count,
-            chars: total_char_count,
-            lines: total_line_count,
-            words: total_word_count,
-            max_line_length: total_longest_line_length,
-        };
-        print_stats(settings, &result, max_width);
+    if num_files > 1 {
+        let total_result = total_word_count.with_title("total");
+        if let Err(err) = print_stats(settings, &total_result, max_width) {
+            show_warning!("failed to print total: {}", err);
+            error_count += 1;
+        }
+    }
+
+    if error_count == 0 {
+        Ok(())
+    } else {
+        Err(error_count)
+    }
+}
+
+fn print_stats(
+    settings: &Settings,
+    result: &TitledWordCount,
+    mut min_width: usize,
+) -> WcResult<()> {
+    let stdout = io::stdout();
+    let mut stdout_lock = stdout.lock();
+
+    if settings.number_enabled() <= 1 {
+        // Prevent a leading space in case we only need to display a single
+        // number.
+        min_width = 0;
+    }
+
+    if settings.show_lines {
+        write!(stdout_lock, "{:1$}", result.count.lines, min_width)?;
+    }
+    if settings.show_words {
+        write!(stdout_lock, "{:1$}", result.count.words, min_width)?;
+    }
+    if settings.show_bytes {
+        write!(stdout_lock, "{:1$}", result.count.bytes, min_width)?;
+    }
+    if settings.show_chars {
+        write!(stdout_lock, "{:1$}", result.count.chars, min_width)?;
+    }
+    if settings.show_max_line_length {
+        write!(
+            stdout_lock,
+            "{:1$}",
+            result.count.max_line_length, min_width
+        )?;
+    }
+
+    if result.title == "-" {
+        writeln!(stdout_lock, "")?;
+    } else {
+        writeln!(stdout_lock, " {}", result.title)?;
    }

    Ok(())
 }
-
-fn print_stats(settings: &Settings, result: &Result, max_width: usize) {
-    if settings.show_lines {
-        print!("{:1$}", result.lines, max_width);
-    }
-    if settings.show_words {
-        print!("{:1$}", result.words, max_width);
-    }
-    if settings.show_bytes {
-        print!("{:1$}", result.bytes, max_width);
-    }
-    if settings.show_chars {
-        print!("{:1$}", result.chars, max_width);
-    }
-    if settings.show_max_line_length {
-        print!("{:1$}", result.max_line_length, max_width);
-    }
-
-    if result.title != "-" {
-        println!(" {}", result.title);
-    } else {
-        println!();
-    }
-}
-
-fn open(path: &str) -> StdResult<BufReader<Box<dyn Read + 'static>>, i32> {
-    if "-" == path {
-        let reader = Box::new(stdin()) as Box<dyn Read>;
-        return Ok(BufReader::new(reader));
-    }
-
-    let fpath = Path::new(path);
-    if fpath.is_dir() {
-        show_info!("{}: is a directory", path);
-    }
-    match File::open(&fpath) {
-        Ok(fd) => {
-            let reader = Box::new(fd) as Box<dyn Read>;
-            Ok(BufReader::new(reader))
-        }
-        Err(e) => {
-            show_error!("wc: {}: {}", path, e);
-            Err(1)
-        }
-    }
-}
--- a/tests/by-util/test_wc.rs
+++ b/tests/by-util/test_wc.rs
@ -25,7 +25,7 @@ fn test_stdin_line_len_regression() {
        .args(&["-L"])
        .pipe_in("\n123456")
        .run()
-        .stdout_is(" 6\n");
+        .stdout_is("6\n");
 }

 #[test]
@ -34,7 +34,7 @@ fn test_stdin_only_bytes() {
        .args(&["-c"])
        .pipe_in_fixture("lorem_ipsum.txt")
        .run()
-        .stdout_is(" 772\n");
+        .stdout_is("772\n");
 }

 #[test]
@ -59,7 +59,7 @@ fn test_single_only_lines() {
    new_ucmd!()
        .args(&["-l", "moby_dick.txt"])
        .run()
-        .stdout_is("   18 moby_dick.txt\n");
+        .stdout_is("18 moby_dick.txt\n");
 }

 #[test]