wc: Do a chunked read with proper UTF-8 handling

This brings the results mostly in line with GNU wc and solves nasty
behavior with long lines.
This commit is contained in:
Jan Verbeek 2021-08-25 13:26:44 +02:00 committed by Michael Debertol
parent 48437fc49d
commit 6f7d740592
8 changed files with 114 additions and 144 deletions

8
Cargo.lock generated
View file

@@ -2046,6 +2046,12 @@ dependencies = [
"log",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.5"
@@ -3121,6 +3127,8 @@ dependencies = [
"libc",
"nix 0.20.0",
"thiserror",
"unicode-width",
"utf-8",
"uucore",
"uucore_procs",
]

View file

@@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
thiserror = "1.0"
bytecount = "0.6.2"
utf-8 = "0.7.6"
unicode-width = "0.1.8"
[target.'cfg(unix)'.dependencies]
nix = "0.20"

View file

@@ -4,7 +4,7 @@
//! for some common file-like objects. Use the [`WordCountable::lines`]
//! method to get an iterator over lines of a file-like object.
use std::fs::File;
use std::io::{self, BufRead, BufReader, Read, StdinLock};
use std::io::{BufRead, BufReader, Read, StdinLock};
#[cfg(unix)]
use std::os::unix::io::AsRawFd;
@@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd;
#[cfg(unix)]
pub trait WordCountable: AsRawFd + Read {
type Buffered: BufRead;
fn lines(self) -> Lines<Self::Buffered>;
fn buffered(self) -> Self::Buffered;
}
#[cfg(not(unix))]
pub trait WordCountable: Read {
type Buffered: BufRead;
fn lines(self) -> Lines<Self::Buffered>;
fn buffered(self) -> Self::Buffered;
}
impl WordCountable for StdinLock<'_> {
type Buffered = Self;
fn lines(self) -> Lines<Self::Buffered>
where
Self: Sized,
{
Lines::new(self)
fn buffered(self) -> Self::Buffered {
self
}
}
impl WordCountable for File {
type Buffered = BufReader<Self>;
fn lines(self) -> Lines<Self::Buffered>
where
Self: Sized,
{
Lines::new(BufReader::new(self))
}
}
/// An iterator over the lines of an instance of `BufRead`.
///
/// Similar to [`io::Lines`] but yields each line as a `Vec<u8>` and
/// includes the newline character (`\n`, the `0xA` byte) that
/// terminates the line.
///
/// [`io::Lines`]:: io::Lines
pub struct Lines<B> {
buf: B,
line: Vec<u8>,
}
impl<B: BufRead> Lines<B> {
fn new(reader: B) -> Self {
Lines {
buf: reader,
line: Vec::new(),
}
}
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
self.line.clear();
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
// hence the option wrapped in a result here
match self.buf.read_until(b'\n', &mut self.line) {
Ok(0) => None,
Ok(_n) => Some(Ok(&self.line)),
Err(e) => Some(Err(e)),
}
fn buffered(self) -> Self::Buffered {
BufReader::new(self)
}
}

View file

@@ -13,11 +13,14 @@ mod countable;
mod word_count;
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use countable::WordCountable;
use unicode_width::UnicodeWidthChar;
use utf8::{BufReadDecoder, BufReadDecoderError};
use word_count::{TitledWordCount, WordCount};
use clap::{crate_version, App, Arg, ArgMatches};
use thiserror::Error;
use std::cmp::max;
use std::fs::{self, File};
use std::io::{self, ErrorKind, Write};
use std::path::Path;
@@ -224,16 +227,59 @@ fn word_count_from_reader<T: WordCountable>(
return count_bytes_and_lines_fast(&mut reader);
}
// Sum the WordCount for each line. Show a warning for each line
// that results in an IO error when trying to read it.
let mut lines = reader.lines();
let mut total = WordCount::default();
while let Some(res) = lines.next() {
match res {
Ok(line) => total += WordCount::from_line(line),
Err(e) => show_warning!("Error while reading {}: {}", path, e),
let mut reader = BufReadDecoder::new(reader.buffered());
let mut in_word = false;
let mut current_len = 0;
while let Some(chunk) = reader.next_strict() {
match chunk {
Ok(text) => {
for ch in text.chars() {
if ch.is_whitespace() {
in_word = false;
} else if ch.is_ascii_control() {
// These count as characters but do not affect the word state
} else if !in_word {
in_word = true;
total.words += 1;
}
match ch {
'\n' => {
total.max_line_length = max(current_len, total.max_line_length);
current_len = 0;
total.lines += 1;
}
// '\x0c' = '\f'
'\r' | '\x0c' => {
total.max_line_length = max(current_len, total.max_line_length);
current_len = 0;
}
'\t' => {
current_len -= current_len % 8;
current_len += 8;
}
_ => {
current_len += ch.width().unwrap_or(0);
}
}
total.chars += 1;
}
total.bytes += text.len();
}
Err(BufReadDecoderError::InvalidByteSequence(bytes)) => {
// GNU wc treats invalid data as neither word nor char nor whitespace,
// so no other counters are affected
total.bytes += bytes.len();
}
Err(BufReadDecoderError::Io(e)) => {
show_warning!("Error while reading {}: {}", path, e);
}
}
}
total.max_line_length = max(current_len, total.max_line_length);
Ok(total)
}

View file

@@ -1,19 +1,5 @@
use std::cmp::max;
use std::iter::Sum;
use std::ops::{Add, AddAssign};
use std::str::from_utf8;
const CR: u8 = b'\r';
const LF: u8 = b'\n';
const SPACE: u8 = b' ';
const TAB: u8 = b'\t';
const SYN: u8 = 0x16_u8;
const FF: u8 = 0x0C_u8;
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
}
#[derive(Debug, Default, Copy, Clone)]
pub struct WordCount {
@@ -44,76 +30,10 @@ impl AddAssign for WordCount {
}
}
impl Sum for WordCount {
fn sum<I>(iter: I) -> WordCount
where
I: Iterator<Item = WordCount>,
{
iter.fold(WordCount::default(), |acc, x| acc + x)
}
}
impl WordCount {
/// Count the characters and whitespace-separated words in the given bytes.
///
/// `line` is a slice of bytes that will be decoded as ASCII characters.
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
let word_count = line.split(|&x| is_word_separator(x)).count();
let char_count = line.iter().filter(|c| c.is_ascii()).count();
(word_count, char_count)
}
/// Create a [`WordCount`] from a sequence of bytes representing a line.
///
/// If the last byte of `line` encodes a newline character (`\n`),
/// then the [`lines`] field will be set to 1. Otherwise, it will
/// be set to 0. The [`bytes`] field is simply the length of
/// `line`.
///
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
/// fields will be set to 0. If it is `true`, this function will
/// attempt to decode the bytes first as UTF-8, and failing that,
/// as ASCII.
pub fn from_line(line: &[u8]) -> WordCount {
// GNU 'wc' only counts lines that end in LF as lines
let lines = (*line.last().unwrap() == LF) as usize;
let bytes = line.len();
let (words, chars) = WordCount::word_and_char_count(line);
// -L is a GNU 'wc' extension so same behavior on LF
let max_line_length = if chars > 0 { chars - lines } else { 0 };
WordCount {
bytes,
chars,
lines,
words,
max_line_length,
}
}
/// Count the UTF-8 characters and words in the given string slice.
///
/// `s` is a string slice that is assumed to be a UTF-8 string.
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
let word_count = s.split_whitespace().count();
let char_count = s.chars().count();
(word_count, char_count)
}
pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
TitledWordCount { title, count: self }
}
/// Count the characters and words in the given slice of bytes.
///
/// `line` is a slice of bytes that will be decoded as UTF-8
/// characters, or if that fails, as ASCII characters.
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
// try and convert the bytes to UTF-8 first
match from_utf8(line) {
Ok(s) => WordCount::utf8_word_and_char_count(s),
Err(..) => WordCount::ascii_word_and_char_count(line),
}
}
}
/// This struct supplements the actual word count with an optional title that is

View file

@@ -53,11 +53,16 @@ fn test_utf8() {
.args(&["-lwmcL"])
.pipe_in_fixture("UTF_8_test.txt")
.run()
.stdout_is(" 300 4969 22781 22213 79\n");
// GNU returns " 300 2086 22219 22781 79"
//
// TODO: we should fix the word, character, and byte count to
// match the behavior of GNU wc
.stdout_is(" 303 2119 23025 22457 79\n");
}
#[test]
fn test_utf8_extra() {
new_ucmd!()
.arg("-lwmcL")
.pipe_in_fixture("UTF_8_weirdchars.txt")
.run()
.stdout_is(" 25 87 513 442 48\n");
}
#[test]

View file

@@ -72,17 +72,20 @@ You should see the Greek word 'kosme': "κόσμε"
2.1.2 2 bytes (U-00000080): "€" |
2.1.3 3 bytes (U-00000800): "ࠀ" |
2.1.4 4 bytes (U-00010000): "𐀀" |
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
2.1.5 5 bytes (U-00200000): "øˆ€x€€" |
2.1.6 6 bytes (U-04000000): "ü„€x€€€" |
[uutils note: We don't recognize codepoints above 10FFFF, while GNU wc does. ]
[This discrepancy is acceptable, so 'x'es have been inserted to still make ]
[the results identical to our implementation. ]
|
2.2 Last possible sequence of a certain length |
|
2.2.1 1 byte (U-0000007F): ""
2.2.2 2 bytes (U-000007FF): "߿" |
2.2.3 3 bytes (U-0000FFFF): "￿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿x¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿x¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿x¿¿¿" |
|
2.3 Other boundary conditions |
|
@@ -90,7 +93,7 @@ You should see the Greek word 'kosme': "κόσμε"
2.3.2 U-0000E000 = ee 80 80 = "" |
2.3.3 U-0000FFFD = ef bf bd = "<22>" |
2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" |
2.3.5 U-00110000 = f4 90 80 80 = "ô<>€€" |
2.3.5 U-00110000 = f4 90 80 80 = "ô<>x€€" |
|
3 Malformed sequences |
|

25
tests/fixtures/wc/UTF_8_weirdchars.txt vendored Normal file
View file

@@ -0,0 +1,25 @@
zero-width space inbetween these: xx
and inbetween two spaces: [ ]
and at the end of the line:
non-breaking space: x x [   ]  
simple unicode: xµx [ µ ] µ
wide: xx [ ]
simple emoji: x👩x [ 👩 ] 👩
complex emoji: x👩🔬x [ 👩‍🔬 ] 👩‍🔬
, !
line feed: x x [ ]
vertical tab: x x [ ]
horizontal tab: x x [ ]
this should be the longest line:
1234567 12345678 123456781234567812345678
Control character: xx [  ]