mirror of
https://github.com/uutils/coreutils
synced 2024-10-15 20:34:28 +00:00
Copy the UTF8 crate in the tree and remove utf8 dependency. (#4460)
This commit is contained in:
parent
6a54d820ac
commit
084510e499
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -2350,12 +2350,6 @@ dependencies = [
|
|||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.1"
|
||||
|
@ -3330,7 +3324,6 @@ dependencies = [
|
|||
"libc",
|
||||
"nix",
|
||||
"unicode-width",
|
||||
"utf-8",
|
||||
"uucore",
|
||||
]
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@ path = "src/wc.rs"
|
|||
clap = { workspace=true }
|
||||
uucore = { workspace=true, features=["pipes"] }
|
||||
bytecount = { workspace=true }
|
||||
utf-8 = { workspace=true }
|
||||
unicode-width = { workspace=true }
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
|
|
26
src/uu/wc/src/utf8/LICENSE
Normal file
26
src/uu/wc/src/utf8/LICENSE
Normal file
|
@ -0,0 +1,26 @@
|
|||
// spell-checker:ignore Sapin
|
||||
Copyright (c) Simon Sapin and many others
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
93
src/uu/wc/src/utf8/mod.rs
Normal file
93
src/uu/wc/src/utf8/mod.rs
Normal file
|
@ -0,0 +1,93 @@
|
|||
// spell-checker:ignore Sapin
|
||||
mod read;
|
||||
|
||||
pub use read::{BufReadDecoder, BufReadDecoderError};
|
||||
|
||||
use std::cmp;
|
||||
use std::str;
|
||||
|
||||
/// Holds the leading bytes of a code point whose remaining bytes have not
/// been seen yet, so that decoding can resume when more input arrives.
///
/// This type is part of an incremental, zero-copy UTF-8 decoder with error handling.
///
/// The original implementation was written by Simon Sapin in the utf-8 crate <https://crates.io/crates/utf-8>.
/// uu_wc used to depend on that crate.
/// The author archived the repository <https://github.com/SimonSapin/rust-utf8>.
/// They suggested incorporating the source directly into uu_wc <https://github.com/uutils/coreutils/issues/4289>.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    /// Backing storage; only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use (0 means empty).
    pub buffer_len: u8,
}

impl Incomplete {
    /// Creates a value that holds no bytes.
    pub fn empty() -> Self {
        Self {
            buffer: [0; 4],
            buffer_len: 0,
        }
    }

    /// Returns `true` when no bytes are buffered.
    pub fn is_empty(&self) -> bool {
        self.buffer_len == 0
    }

    /// Creates a value holding a copy of `bytes`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` is longer than the 4-byte internal buffer.
    pub fn new(bytes: &[u8]) -> Self {
        let mut buffer = [0u8; 4];
        let len = bytes.len();
        // State the precondition explicitly instead of failing later with an
        // opaque slice-index panic.
        assert!(
            len <= buffer.len(),
            "Incomplete::new: at most {} bytes can be buffered, got {}",
            buffer.len(),
            len
        );
        buffer[..len].copy_from_slice(bytes);
        Self {
            buffer,
            // Cannot truncate: `len <= 4` was asserted above.
            buffer_len: len as u8,
        }
    }

    /// Returns the buffered bytes and marks the buffer as empty.
    fn take_buffer(&mut self) -> &[u8] {
        let len = self.buffer_len as usize;
        self.buffer_len = 0;
        &self.buffer[..len]
    }

    /// Tries to finish the buffered code point with bytes taken from `input`.
    ///
    /// As many bytes of `input` as fit are appended to the buffer and the
    /// result is validated. The returned pair is:
    ///
    /// * `(consumed_from_input, None)`: not enough input
    /// * `(consumed_from_input, Some(Err(())))`: error bytes in buffer
    /// * `(consumed_from_input, Some(Ok(())))`: UTF-8 string in buffer
    ///
    /// In every case `buffer_len` is updated to cover exactly the bytes the
    /// result refers to.
    ///
    /// # Panics
    ///
    /// Panics if the decoded (or rejected) prefix is shorter than what was
    /// already buffered. This is not expected as long as the buffer holds a
    /// proper prefix of a single code point, which is how the decoder in
    /// `read.rs` fills it.
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let initial_buffer_len = self.buffer_len as usize;
        let copied_from_input = {
            let unwritten = &mut self.buffer[initial_buffer_len..];
            let count = cmp::min(unwritten.len(), input.len());
            unwritten[..count].copy_from_slice(&input[..count]);
            count
        };
        // Never exceeds 4 (the buffer size), so the `as u8` casts below are lossless.
        let spliced_len = initial_buffer_len + copied_from_input;

        let error = match str::from_utf8(&self.buffer[..spliced_len]) {
            Ok(_) => {
                self.buffer_len = spliced_len as u8;
                return (copied_from_input, Some(Ok(())));
            }
            Err(error) => error,
        };

        let valid_up_to = error.valid_up_to();
        if valid_up_to > 0 {
            // The buffered code point was completed; whatever follows it in
            // the buffer is left for the caller to read again from `input`.
            let consumed = valid_up_to
                .checked_sub(initial_buffer_len)
                .expect("valid prefix is shorter than the bytes already buffered");
            self.buffer_len = valid_up_to as u8;
            return (consumed, Some(Ok(())));
        }

        match error.error_len() {
            Some(invalid_sequence_length) => {
                let consumed = invalid_sequence_length
                    .checked_sub(initial_buffer_len)
                    .expect("invalid sequence is shorter than the bytes already buffered");
                self.buffer_len = invalid_sequence_length as u8;
                (consumed, Some(Err(())))
            }
            None => {
                // Still a valid but unfinished prefix: keep everything copied so far.
                self.buffer_len = spliced_len as u8;
                (copied_from_input, None)
            }
        }
    }
}
|
137
src/uu/wc/src/utf8/read.rs
Normal file
137
src/uu/wc/src/utf8/read.rs
Normal file
|
@ -0,0 +1,137 @@
|
|||
// spell-checker:ignore bytestream
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufRead};
|
||||
use std::str;
|
||||
|
||||
/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
|
||||
pub struct BufReadDecoder<B: BufRead> {
|
||||
/// The wrapped reader that supplies the raw bytes.
buf_read: B,
|
||||
/// Number of bytes of the reader's buffer that have been handed out or
/// copied already; they are passed to `consume` on the next loop iteration
/// of `next_strict`.
bytes_consumed: usize,
|
||||
/// Bytes of a code point that was cut off at the end of the reader's buffer.
incomplete: Incomplete,
|
||||
}
|
||||
|
||||
/// Error yielded by `BufReadDecoder::next_strict`.
#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
    /// Represents one UTF-8 error in the byte stream.
    ///
    /// The slice holds the offending bytes.
    /// In lossy decoding, each such error should be replaced with U+FFFD.
    InvalidByteSequence(&'a [u8]),

    /// An I/O error from the underlying byte stream
    Io(io::Error),
}

impl fmt::Display for BufReadDecoderError<'_> {
    /// Writes a one-line description; invalid bytes are shown as two-digit hex.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidByteSequence(bytes) => {
                write!(f, "invalid byte sequence: {:02x?}", bytes)
            }
            Self::Io(err) => write!(f, "underlying bytestream error: {}", err),
        }
    }
}

impl Error for BufReadDecoderError<'_> {
    /// Returns the wrapped I/O error, if any; invalid byte sequences have no source.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match self {
            Self::InvalidByteSequence(_) => None,
            Self::Io(err) => Some(err),
        }
    }
}
|
||||
|
||||
impl<B: BufRead> BufReadDecoder<B> {
|
||||
/// Creates a decoder reading from `buf_read`, with nothing pending to
/// consume and no partial code point buffered.
pub fn new(buf_read: B) -> Self {
|
||||
Self {
|
||||
buf_read,
|
||||
bytes_consumed: 0,
|
||||
incomplete: Incomplete::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode and consume the next chunk of UTF-8 input.
|
||||
///
|
||||
/// This method is intended to be called repeatedly until it returns `None`,
|
||||
/// which represents EOF from the underlying byte stream.
|
||||
/// This is similar to `Iterator::next`,
|
||||
/// except that decoded chunks borrow the decoder (~iterator)
|
||||
/// so they need to be handled or copied before the next chunk can start decoding.
///
/// Return values:
///
/// * `None`: the reader is at EOF and no partial code point is pending.
/// * `Some(Ok(chunk))`: `chunk` is the next run of valid UTF-8.
/// * `Some(Err(BufReadDecoderError::InvalidByteSequence(bytes)))`: `bytes` is
///   one invalid sequence, or a code point left unfinished at EOF.
/// * `Some(Err(BufReadDecoderError::Io(error)))`: `fill_buf` on the
///   underlying reader failed.
|
||||
pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
|
||||
// Where the bytes of the chunk being returned live.
enum BytesSource {
|
||||
// The first `usize` bytes of the reader's own buffer.
BufRead(usize),
|
||||
// The bytes collected in `self.incomplete`.
Incomplete,
|
||||
}
|
||||
// Unwraps an `io::Result`, returning early from `next_strict` with
// `Some(Err(Io(..)))` on failure.
macro_rules! try_io {
|
||||
($io_result: expr) => {
|
||||
match $io_result {
|
||||
Ok(value) => value,
|
||||
Err(error) => return Some(Err(BufReadDecoderError::Io(error))),
|
||||
}
|
||||
};
|
||||
}
|
||||
let (source, result) = loop {
|
||||
// Bytes handed out (or copied into `self.incomplete`) earlier are
// only released here, right before the reader's buffer is looked
// at again.
if self.bytes_consumed > 0 {
|
||||
self.buf_read.consume(self.bytes_consumed);
|
||||
self.bytes_consumed = 0;
|
||||
}
|
||||
let buf = try_io!(self.buf_read.fill_buf());
|
||||
|
||||
// Force loop iteration to go through an explicit `continue`
// (`Unreachable` has no variants, so no branch below can produce a
// value: each one has to `return`, `break` or `continue`).
|
||||
enum Unreachable {}
|
||||
let _: Unreachable = if self.incomplete.is_empty() {
|
||||
if buf.is_empty() {
|
||||
return None; // EOF
|
||||
}
|
||||
match str::from_utf8(buf) {
|
||||
// The whole buffer is valid UTF-8.
Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())),
|
||||
Err(error) => {
|
||||
let valid_up_to = error.valid_up_to();
|
||||
// Hand out the valid prefix first; the error itself is
// reported by a later call.
if valid_up_to > 0 {
|
||||
break (BytesSource::BufRead(valid_up_to), Ok(()));
|
||||
}
|
||||
match error.error_len() {
|
||||
// The buffer starts with a definitely invalid sequence.
Some(invalid_sequence_length) => {
|
||||
break (BytesSource::BufRead(invalid_sequence_length), Err(()))
|
||||
}
|
||||
// The buffer holds nothing but the start of a code point:
// stash it and release the reader's buffer.
None => {
|
||||
self.bytes_consumed = buf.len();
|
||||
self.incomplete = Incomplete::new(buf);
|
||||
// need more input bytes
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if buf.is_empty() {
|
||||
break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point
|
||||
}
|
||||
// Try to finish the stashed code point with bytes from the fresh buffer.
let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
|
||||
self.bytes_consumed = consumed;
|
||||
match opt_result {
|
||||
None => {
|
||||
// need more input bytes
|
||||
continue;
|
||||
}
|
||||
Some(result) => break (BytesSource::Incomplete, result),
|
||||
}
|
||||
};
|
||||
};
|
||||
let bytes = match source {
|
||||
BytesSource::BufRead(byte_count) => {
|
||||
// Remember how much to consume on the next call, then fetch the
// reader's buffer again to obtain a borrow that can be returned.
self.bytes_consumed = byte_count;
|
||||
let buf = try_io!(self.buf_read.fill_buf());
|
||||
&buf[..byte_count]
|
||||
}
|
||||
BytesSource::Incomplete => self.incomplete.take_buffer(),
|
||||
};
|
||||
match result {
|
||||
// SAFETY: `result` is `Ok(())` only after `str::from_utf8` accepted
// these bytes, either as a prefix of the reader's buffer in the loop
// above or as the contents of `self.incomplete` inside
// `try_complete_offsets`.
// NOTE(review): in the `BufRead` case the bytes come from a second
// `fill_buf` call; no `consume` happens in between, so this presumably
// returns the same data, but that relies on the `BufRead`
// implementation behaving as documented. Confirm for custom readers.
Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
|
||||
Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
mod count_fast;
|
||||
mod countable;
|
||||
mod utf8;
|
||||
mod word_count;
|
||||
use clap::builder::ValueParser;
|
||||
use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};
|
||||
|
|
Loading…
Reference in a new issue