mirror of
https://github.com/uutils/coreutils
synced 2024-11-05 14:21:32 +00:00
wc: Do a chunked read with proper UTF-8 handling
This brings the results mostly in line with GNU wc and solves nasty behavior with long lines.
This commit is contained in:
parent
48437fc49d
commit
6f7d740592
8 changed files with 114 additions and 144 deletions
8
Cargo.lock
generated
8
Cargo.lock
generated
|
@ -2046,6 +2046,12 @@ dependencies = [
|
|||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.5"
|
||||
|
@ -3121,6 +3127,8 @@ dependencies = [
|
|||
"libc",
|
||||
"nix 0.20.0",
|
||||
"thiserror",
|
||||
"unicode-width",
|
||||
"utf-8",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
]
|
||||
|
|
|
@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
|||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||
thiserror = "1.0"
|
||||
bytecount = "0.6.2"
|
||||
utf-8 = "0.7.6"
|
||||
unicode-width = "0.1.8"
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
nix = "0.20"
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
//! for some common file-like objects. Use the [`WordCountable::lines`]
|
||||
//! method to get an iterator over lines of a file-like object.
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, Read, StdinLock};
|
||||
use std::io::{BufRead, BufReader, Read, StdinLock};
|
||||
|
||||
#[cfg(unix)]
|
||||
use std::os::unix::io::AsRawFd;
|
||||
|
@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd;
|
|||
#[cfg(unix)]
|
||||
pub trait WordCountable: AsRawFd + Read {
|
||||
type Buffered: BufRead;
|
||||
fn lines(self) -> Lines<Self::Buffered>;
|
||||
fn buffered(self) -> Self::Buffered;
|
||||
}
|
||||
|
||||
#[cfg(not(unix))]
|
||||
pub trait WordCountable: Read {
|
||||
type Buffered: BufRead;
|
||||
fn lines(self) -> Lines<Self::Buffered>;
|
||||
fn buffered(self) -> Self::Buffered;
|
||||
}
|
||||
|
||||
impl WordCountable for StdinLock<'_> {
|
||||
type Buffered = Self;
|
||||
|
||||
fn lines(self) -> Lines<Self::Buffered>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
Lines::new(self)
|
||||
fn buffered(self) -> Self::Buffered {
|
||||
self
|
||||
}
|
||||
}
|
||||
impl WordCountable for File {
|
||||
type Buffered = BufReader<Self>;
|
||||
|
||||
fn lines(self) -> Lines<Self::Buffered>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
Lines::new(BufReader::new(self))
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the lines of an instance of `BufRead`.
|
||||
///
|
||||
/// Similar to [`io::Lines`] but yields each line as a `Vec<u8>` and
|
||||
/// includes the newline character (`\n`, the `0xA` byte) that
|
||||
/// terminates the line.
|
||||
///
|
||||
/// [`io::Lines`]:: io::Lines
|
||||
pub struct Lines<B> {
|
||||
buf: B,
|
||||
line: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<B: BufRead> Lines<B> {
|
||||
fn new(reader: B) -> Self {
|
||||
Lines {
|
||||
buf: reader,
|
||||
line: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
|
||||
self.line.clear();
|
||||
|
||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||
// hence the option wrapped in a result here
|
||||
match self.buf.read_until(b'\n', &mut self.line) {
|
||||
Ok(0) => None,
|
||||
Ok(_n) => Some(Ok(&self.line)),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
fn buffered(self) -> Self::Buffered {
|
||||
BufReader::new(self)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,11 +13,14 @@ mod countable;
|
|||
mod word_count;
|
||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
||||
use countable::WordCountable;
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||
use word_count::{TitledWordCount, WordCount};
|
||||
|
||||
use clap::{crate_version, App, Arg, ArgMatches};
|
||||
use thiserror::Error;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, ErrorKind, Write};
|
||||
use std::path::Path;
|
||||
|
@ -224,16 +227,59 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
return count_bytes_and_lines_fast(&mut reader);
|
||||
}
|
||||
|
||||
// Sum the WordCount for each line. Show a warning for each line
|
||||
// that results in an IO error when trying to read it.
|
||||
let mut lines = reader.lines();
|
||||
let mut total = WordCount::default();
|
||||
while let Some(res) = lines.next() {
|
||||
match res {
|
||||
Ok(line) => total += WordCount::from_line(line),
|
||||
Err(e) => show_warning!("Error while reading {}: {}", path, e),
|
||||
let mut reader = BufReadDecoder::new(reader.buffered());
|
||||
let mut in_word = false;
|
||||
let mut current_len = 0;
|
||||
|
||||
while let Some(chunk) = reader.next_strict() {
|
||||
match chunk {
|
||||
Ok(text) => {
|
||||
for ch in text.chars() {
|
||||
if ch.is_whitespace() {
|
||||
in_word = false;
|
||||
} else if ch.is_ascii_control() {
|
||||
// These count as characters but do not affect the word state
|
||||
} else if !in_word {
|
||||
in_word = true;
|
||||
total.words += 1;
|
||||
}
|
||||
match ch {
|
||||
'\n' => {
|
||||
total.max_line_length = max(current_len, total.max_line_length);
|
||||
current_len = 0;
|
||||
total.lines += 1;
|
||||
}
|
||||
// '\x0c' = '\f'
|
||||
'\r' | '\x0c' => {
|
||||
total.max_line_length = max(current_len, total.max_line_length);
|
||||
current_len = 0;
|
||||
}
|
||||
'\t' => {
|
||||
current_len -= current_len % 8;
|
||||
current_len += 8;
|
||||
}
|
||||
_ => {
|
||||
current_len += ch.width().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
total.chars += 1;
|
||||
}
|
||||
total.bytes += text.len();
|
||||
}
|
||||
Err(BufReadDecoderError::InvalidByteSequence(bytes)) => {
|
||||
// GNU wc treats invalid data as neither word nor char nor whitespace,
|
||||
// so no other counters are affected
|
||||
total.bytes += bytes.len();
|
||||
}
|
||||
Err(BufReadDecoderError::Io(e)) => {
|
||||
show_warning!("Error while reading {}: {}", path, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
total.max_line_length = max(current_len, total.max_line_length);
|
||||
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
|
|
|
@ -1,19 +1,5 @@
|
|||
use std::cmp::max;
|
||||
use std::iter::Sum;
|
||||
use std::ops::{Add, AddAssign};
|
||||
use std::str::from_utf8;
|
||||
|
||||
const CR: u8 = b'\r';
|
||||
const LF: u8 = b'\n';
|
||||
const SPACE: u8 = b' ';
|
||||
const TAB: u8 = b'\t';
|
||||
const SYN: u8 = 0x16_u8;
|
||||
const FF: u8 = 0x0C_u8;
|
||||
|
||||
#[inline(always)]
|
||||
fn is_word_separator(byte: u8) -> bool {
|
||||
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Copy, Clone)]
|
||||
pub struct WordCount {
|
||||
|
@ -44,76 +30,10 @@ impl AddAssign for WordCount {
|
|||
}
|
||||
}
|
||||
|
||||
impl Sum for WordCount {
|
||||
fn sum<I>(iter: I) -> WordCount
|
||||
where
|
||||
I: Iterator<Item = WordCount>,
|
||||
{
|
||||
iter.fold(WordCount::default(), |acc, x| acc + x)
|
||||
}
|
||||
}
|
||||
|
||||
impl WordCount {
|
||||
/// Count the characters and whitespace-separated words in the given bytes.
|
||||
///
|
||||
/// `line` is a slice of bytes that will be decoded as ASCII characters.
|
||||
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||
let word_count = line.split(|&x| is_word_separator(x)).count();
|
||||
let char_count = line.iter().filter(|c| c.is_ascii()).count();
|
||||
(word_count, char_count)
|
||||
}
|
||||
|
||||
/// Create a [`WordCount`] from a sequence of bytes representing a line.
|
||||
///
|
||||
/// If the last byte of `line` encodes a newline character (`\n`),
|
||||
/// then the [`lines`] field will be set to 1. Otherwise, it will
|
||||
/// be set to 0. The [`bytes`] field is simply the length of
|
||||
/// `line`.
|
||||
///
|
||||
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
|
||||
/// fields will be set to 0. If it is `true`, this function will
|
||||
/// attempt to decode the bytes first as UTF-8, and failing that,
|
||||
/// as ASCII.
|
||||
pub fn from_line(line: &[u8]) -> WordCount {
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
let lines = (*line.last().unwrap() == LF) as usize;
|
||||
let bytes = line.len();
|
||||
let (words, chars) = WordCount::word_and_char_count(line);
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
||||
WordCount {
|
||||
bytes,
|
||||
chars,
|
||||
lines,
|
||||
words,
|
||||
max_line_length,
|
||||
}
|
||||
}
|
||||
|
||||
/// Count the UTF-8 characters and words in the given string slice.
|
||||
///
|
||||
/// `s` is a string slice that is assumed to be a UTF-8 string.
|
||||
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
|
||||
let word_count = s.split_whitespace().count();
|
||||
let char_count = s.chars().count();
|
||||
(word_count, char_count)
|
||||
}
|
||||
|
||||
pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
|
||||
TitledWordCount { title, count: self }
|
||||
}
|
||||
|
||||
/// Count the characters and words in the given slice of bytes.
|
||||
///
|
||||
/// `line` is a slice of bytes that will be decoded as UTF-8
|
||||
/// characters, or if that fails, as ASCII characters.
|
||||
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||
// try and convert the bytes to UTF-8 first
|
||||
match from_utf8(line) {
|
||||
Ok(s) => WordCount::utf8_word_and_char_count(s),
|
||||
Err(..) => WordCount::ascii_word_and_char_count(line),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct supplements the actual word count with an optional title that is
|
||||
|
|
|
@ -53,11 +53,16 @@ fn test_utf8() {
|
|||
.args(&["-lwmcL"])
|
||||
.pipe_in_fixture("UTF_8_test.txt")
|
||||
.run()
|
||||
.stdout_is(" 300 4969 22781 22213 79\n");
|
||||
// GNU returns " 300 2086 22219 22781 79"
|
||||
//
|
||||
// TODO: we should fix the word, character, and byte count to
|
||||
// match the behavior of GNU wc
|
||||
.stdout_is(" 303 2119 23025 22457 79\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_utf8_extra() {
|
||||
new_ucmd!()
|
||||
.arg("-lwmcL")
|
||||
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||
.run()
|
||||
.stdout_is(" 25 87 513 442 48\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
15
tests/fixtures/wc/UTF_8_test.txt
vendored
15
tests/fixtures/wc/UTF_8_test.txt
vendored
|
@ -72,17 +72,20 @@ You should see the Greek word 'kosme': "κόσμε"
|
|||
2.1.2 2 bytes (U-00000080): "" |
|
||||
2.1.3 3 bytes (U-00000800): "ࠀ" |
|
||||
2.1.4 4 bytes (U-00010000): "𐀀" |
|
||||
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
|
||||
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
|
||||
2.1.5 5 bytes (U-00200000): "øˆ€x€€" |
|
||||
2.1.6 6 bytes (U-04000000): "ü„€x€€€" |
|
||||
[uutils note: We don't recognize codepoints above 10FFFF, while GNU wc does. ]
|
||||
[This discrepancy is acceptable, so 'x'es have been inserted to still make ]
|
||||
[the results identical to our implementation. ]
|
||||
|
|
||||
2.2 Last possible sequence of a certain length |
|
||||
|
|
||||
2.2.1 1 byte (U-0000007F): ""
|
||||
2.2.2 2 bytes (U-000007FF): "߿" |
|
||||
2.2.3 3 bytes (U-0000FFFF): "" |
|
||||
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
|
||||
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
|
||||
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
|
||||
2.2.4 4 bytes (U-001FFFFF): "÷¿¿x¿" |
|
||||
2.2.5 5 bytes (U-03FFFFFF): "û¿¿x¿¿" |
|
||||
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿x¿¿¿" |
|
||||
|
|
||||
2.3 Other boundary conditions |
|
||||
|
|
||||
|
@ -90,7 +93,7 @@ You should see the Greek word 'kosme': "κόσμε"
|
|||
2.3.2 U-0000E000 = ee 80 80 = "" |
|
||||
2.3.3 U-0000FFFD = ef bf bd = "<22>" |
|
||||
2.3.4 U-0010FFFF = f4 8f bf bf = "" |
|
||||
2.3.5 U-00110000 = f4 90 80 80 = "ô<>€€" |
|
||||
2.3.5 U-00110000 = f4 90 80 80 = "ô<>x€€" |
|
||||
|
|
||||
3 Malformed sequences |
|
||||
|
|
||||
|
|
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
|
@ -0,0 +1,25 @@
|
|||
zero-width space inbetween these: xx
|
||||
and inbetween two spaces: [ ]
|
||||
and at the end of the line:
|
||||
|
||||
non-breaking space: x x [ ]
|
||||
|
||||
simple unicode: xµx [ µ ] µ
|
||||
|
||||
wide: xwx [ w ] w
|
||||
|
||||
simple emoji: x👩x [ 👩 ] 👩
|
||||
|
||||
complex emoji: x👩🔬x [ 👩🔬 ] 👩🔬
|
||||
|
||||
Hello, world!
|
||||
|
||||
line feed: xx [ ]
|
||||
|
||||
vertical tab: xx [ ]
|
||||
|
||||
horizontal tab: x x [ ]
|
||||
this should be the longest line:
|
||||
1234567 12345678 123456781234567812345678
|
||||
|
||||
Control character: xx [ ]
|
Loading…
Reference in a new issue