wc: Do a chunked read with proper UTF-8 handling

This brings the results mostly in line with GNU wc and solves nasty
behavior with long lines.
This commit is contained in:
Jan Verbeek 2021-08-25 13:26:44 +02:00 committed by Michael Debertol
parent 48437fc49d
commit 6f7d740592
8 changed files with 114 additions and 144 deletions

8
Cargo.lock generated
View file

@@ -2046,6 +2046,12 @@ dependencies = [
"log",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.5"
@@ -3121,6 +3127,8 @@ dependencies = [
"libc",
"nix 0.20.0",
"thiserror",
"unicode-width",
"utf-8",
"uucore",
"uucore_procs",
]

View file

@@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
thiserror = "1.0"
bytecount = "0.6.2"
utf-8 = "0.7.6"
unicode-width = "0.1.8"
[target.'cfg(unix)'.dependencies]
nix = "0.20"

View file

@@ -4,7 +4,7 @@
//! for some common file-like objects. Use the [`WordCountable::lines`]
//! method to get an iterator over lines of a file-like object.
use std::fs::File;
use std::io::{self, BufRead, BufReader, Read, StdinLock};
use std::io::{BufRead, BufReader, Read, StdinLock};
#[cfg(unix)]
use std::os::unix::io::AsRawFd;
@@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd;
#[cfg(unix)]
pub trait WordCountable: AsRawFd + Read {
type Buffered: BufRead;
fn lines(self) -> Lines<Self::Buffered>;
fn buffered(self) -> Self::Buffered;
}
#[cfg(not(unix))]
pub trait WordCountable: Read {
type Buffered: BufRead;
fn lines(self) -> Lines<Self::Buffered>;
fn buffered(self) -> Self::Buffered;
}
impl WordCountable for StdinLock<'_> {
type Buffered = Self;
fn lines(self) -> Lines<Self::Buffered>
where
Self: Sized,
{
Lines::new(self)
fn buffered(self) -> Self::Buffered {
self
}
}
impl WordCountable for File {
type Buffered = BufReader<Self>;
fn lines(self) -> Lines<Self::Buffered>
where
Self: Sized,
{
Lines::new(BufReader::new(self))
}
}
/// An iterator over the lines of an instance of `BufRead`.
///
/// Similar to [`io::Lines`] but yields each line as a `Vec<u8>` and
/// includes the newline character (`\n`, the `0xA` byte) that
/// terminates the line.
///
/// [`io::Lines`]:: io::Lines
pub struct Lines<B> {
buf: B,
line: Vec<u8>,
}
impl<B: BufRead> Lines<B> {
fn new(reader: B) -> Self {
Lines {
buf: reader,
line: Vec::new(),
}
}
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
self.line.clear();
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
// hence the option wrapped in a result here
match self.buf.read_until(b'\n', &mut self.line) {
Ok(0) => None,
Ok(_n) => Some(Ok(&self.line)),
Err(e) => Some(Err(e)),
}
fn buffered(self) -> Self::Buffered {
BufReader::new(self)
}
}

View file

@@ -13,11 +13,14 @@ mod countable;
mod word_count;
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use countable::WordCountable;
use unicode_width::UnicodeWidthChar;
use utf8::{BufReadDecoder, BufReadDecoderError};
use word_count::{TitledWordCount, WordCount};
use clap::{crate_version, App, Arg, ArgMatches};
use thiserror::Error;
use std::cmp::max;
use std::fs::{self, File};
use std::io::{self, ErrorKind, Write};
use std::path::Path;
@@ -224,16 +227,59 @@ fn word_count_from_reader<T: WordCountable>(
return count_bytes_and_lines_fast(&mut reader);
}
// Sum the WordCount for each line. Show a warning for each line
// that results in an IO error when trying to read it.
let mut lines = reader.lines();
let mut total = WordCount::default();
while let Some(res) = lines.next() {
match res {
Ok(line) => total += WordCount::from_line(line),
Err(e) => show_warning!("Error while reading {}: {}", path, e),
let mut reader = BufReadDecoder::new(reader.buffered());
let mut in_word = false;
let mut current_len = 0;
while let Some(chunk) = reader.next_strict() {
match chunk {
Ok(text) => {
for ch in text.chars() {
if ch.is_whitespace() {
in_word = false;
} else if ch.is_ascii_control() {
// These count as characters but do not affect the word state
} else if !in_word {
in_word = true;
total.words += 1;
}
match ch {
'\n' => {
total.max_line_length = max(current_len, total.max_line_length);
current_len = 0;
total.lines += 1;
}
// '\x0c' = '\f'
'\r' | '\x0c' => {
total.max_line_length = max(current_len, total.max_line_length);
current_len = 0;
}
'\t' => {
current_len -= current_len % 8;
current_len += 8;
}
_ => {
current_len += ch.width().unwrap_or(0);
}
}
total.chars += 1;
}
total.bytes += text.len();
}
Err(BufReadDecoderError::InvalidByteSequence(bytes)) => {
// GNU wc treats invalid data as neither word nor char nor whitespace,
// so no other counters are affected
total.bytes += bytes.len();
}
Err(BufReadDecoderError::Io(e)) => {
show_warning!("Error while reading {}: {}", path, e);
}
}
}
total.max_line_length = max(current_len, total.max_line_length);
Ok(total)
}

View file

@@ -1,19 +1,5 @@
use std::cmp::max;
use std::iter::Sum;
use std::ops::{Add, AddAssign};
use std::str::from_utf8;
const CR: u8 = b'\r';
const LF: u8 = b'\n';
const SPACE: u8 = b' ';
const TAB: u8 = b'\t';
const SYN: u8 = 0x16_u8;
const FF: u8 = 0x0C_u8;
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
}
#[derive(Debug, Default, Copy, Clone)]
pub struct WordCount {
@@ -44,76 +30,10 @@ impl AddAssign for WordCount {
}
}
impl Sum for WordCount {
fn sum<I>(iter: I) -> WordCount
where
I: Iterator<Item = WordCount>,
{
iter.fold(WordCount::default(), |acc, x| acc + x)
}
}
impl WordCount {
/// Count the characters and whitespace-separated words in the given bytes.
///
/// `line` is a slice of bytes that will be decoded as ASCII characters.
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
let word_count = line.split(|&x| is_word_separator(x)).count();
let char_count = line.iter().filter(|c| c.is_ascii()).count();
(word_count, char_count)
}
/// Create a [`WordCount`] from a sequence of bytes representing a line.
///
/// If the last byte of `line` encodes a newline character (`\n`),
/// then the [`lines`] field will be set to 1. Otherwise, it will
/// be set to 0. The [`bytes`] field is simply the length of
/// `line`.
///
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
/// fields will be set to 0. If it is `true`, this function will
/// attempt to decode the bytes first as UTF-8, and failing that,
/// as ASCII.
pub fn from_line(line: &[u8]) -> WordCount {
// GNU 'wc' only counts lines that end in LF as lines
let lines = (*line.last().unwrap() == LF) as usize;
let bytes = line.len();
let (words, chars) = WordCount::word_and_char_count(line);
// -L is a GNU 'wc' extension so same behavior on LF
let max_line_length = if chars > 0 { chars - lines } else { 0 };
WordCount {
bytes,
chars,
lines,
words,
max_line_length,
}
}
/// Count the UTF-8 characters and words in the given string slice.
///
/// `s` is a string slice that is assumed to be a UTF-8 string.
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
let word_count = s.split_whitespace().count();
let char_count = s.chars().count();
(word_count, char_count)
}
pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
TitledWordCount { title, count: self }
}
/// Count the characters and words in the given slice of bytes.
///
/// `line` is a slice of bytes that will be decoded as UTF-8
/// characters, or if that fails, as ASCII characters.
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
// try and convert the bytes to UTF-8 first
match from_utf8(line) {
Ok(s) => WordCount::utf8_word_and_char_count(s),
Err(..) => WordCount::ascii_word_and_char_count(line),
}
}
}
/// This struct supplements the actual word count with an optional title that is

View file

@@ -53,11 +53,16 @@ fn test_utf8() {
.args(&["-lwmcL"])
.pipe_in_fixture("UTF_8_test.txt")
.run()
.stdout_is(" 300 4969 22781 22213 79\n");
// GNU returns " 300 2086 22219 22781 79"
//
// TODO: we should fix the word, character, and byte count to
// match the behavior of GNU wc
.stdout_is(" 303 2119 23025 22457 79\n");
}
#[test]
fn test_utf8_extra() {
new_ucmd!()
.arg("-lwmcL")
.pipe_in_fixture("UTF_8_weirdchars.txt")
.run()
.stdout_is(" 25 87 513 442 48\n");
}
#[test]

View file

@@ -72,17 +72,20 @@ You should see the Greek word 'kosme': "κόσμε"
2.1.2 2 bytes (U-00000080): "€" |
2.1.3 3 bytes (U-00000800): "ࠀ" |
2.1.4 4 bytes (U-00010000): "𐀀" |
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
2.1.5 5 bytes (U-00200000): "øˆ€x€€" |
2.1.6 6 bytes (U-04000000): "ü„€x€€€" |
[uutils note: We don't recognize codepoints above 10FFFF, while GNU wc does. ]
[This discrepancy is acceptable, so 'x'es have been inserted to still make ]
[the results identical to our implementation. ]
|
2.2 Last possible sequence of a certain length |
|
2.2.1 1 byte (U-0000007F): ""
2.2.2 2 bytes (U-000007FF): "߿" |
2.2.3 3 bytes (U-0000FFFF): "￿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿x¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿x¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿x¿¿¿" |
|
2.3 Other boundary conditions |
|
@@ -90,7 +93,7 @@ You should see the Greek word 'kosme': "κόσμε"
2.3.2 U-0000E000 = ee 80 80 = "" |
2.3.3 U-0000FFFD = ef bf bd = "<22>" |
2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" |
2.3.5 U-00110000 = f4 90 80 80 = "ô<>€€" |
2.3.5 U-00110000 = f4 90 80 80 = "ô<>x€€" |
|
3 Malformed sequences |
|

25
tests/fixtures/wc/UTF_8_weirdchars.txt vendored Normal file
View file

@@ -0,0 +1,25 @@
zero-width space inbetween these: xx
and inbetween two spaces: [ ]
and at the end of the line:
non-breaking space: x x [   ]  
simple unicode: xµx [ µ ] µ
wide: xx [ ]
simple emoji: x👩x [ 👩 ] 👩
complex emoji: x👩🔬x [ 👩‍🔬 ] 👩‍🔬
, !
line feed: x x [ ]
vertical tab: x x [ ]
horizontal tab: x x [ ]
this should be the longest line:
1234567 12345678 123456781234567812345678
Control character: xx [  ]