Copy the UTF8 crate in the tree and remove utf8 dependency. (#4460)

2024-07-22 18:34:12 +00:00 · 2023-04-14 12:31:11 -07:00 · 2023-04-14 12:31:11 -07:00 · 084510e499
parent 6a54d820ac
commit 084510e499
6 changed files with 257 additions and 8 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2350,12 +2350,6 @@ dependencies = [
 "log",
 ]

-[[package]]
-name = "utf-8"
-version = "0.7.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
-
 [[package]]
 name = "utf8parse"
 version = "0.2.1"
@ -3330,7 +3324,6 @@ dependencies = [
 "libc",
 "nix",
 "unicode-width",
- "utf-8",
 "uucore",
 ]

--- a/src/uu/wc/Cargo.toml
+++ b/src/uu/wc/Cargo.toml
@ -18,7 +18,6 @@ path = "src/wc.rs"
 clap = { workspace=true }
 uucore = { workspace=true, features=["pipes"] }
 bytecount = { workspace=true }
-utf-8 = { workspace=true }
 unicode-width = { workspace=true }

 [target.'cfg(unix)'.dependencies]
--- a/src/uu/wc/src/utf8/LICENSE
+++ b/src/uu/wc/src/utf8/LICENSE
@ -0,0 +1,26 @@
+// spell-checker:ignore Sapin
+Copyright (c) Simon Sapin and many others
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/src/uu/wc/src/utf8/mod.rs
+++ b/src/uu/wc/src/utf8/mod.rs
@ -0,0 +1,93 @@
+// spell-checker:ignore Sapin
+mod read;
+
+pub use read::{BufReadDecoder, BufReadDecoderError};
+
+use std::cmp;
+use std::str;
+
+/// Incremental, zero-copy UTF-8 decoding with error handling.
+///
+/// The original implementation was written by Simon Sapin in the utf-8 crate <https://crates.io/crates/utf-8>.
+/// uu_wc used to depend on that crate.
+/// The author archived the repository <https://github.com/SimonSapin/rust-utf8>.
+/// They suggested incorporating the source directly into uu_wc <https://github.com/uutils/coreutils/issues/4289>.
+///
+/// `Incomplete` stores the bytes of a sequence that could not be decoded yet:
+/// at most four bytes, of which the first `buffer_len` are in use.
+#[derive(Debug, Copy, Clone)]
+pub struct Incomplete {
+    pub buffer: [u8; 4], // carried-over bytes; only `buffer[..buffer_len]` is meaningful
+    pub buffer_len: u8, // number of bytes of `buffer` currently in use (0 means empty)
+}
+
+impl Incomplete {
+    pub fn empty() -> Self {
+        Self {
+            buffer: [0, 0, 0, 0],
+            buffer_len: 0, // nothing buffered yet
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.buffer_len == 0 // true when no bytes are buffered
+    }
+
+    pub fn new(bytes: &[u8]) -> Self {
+        let mut buffer = [0, 0, 0, 0];
+        let len = bytes.len();
+        buffer[..len].copy_from_slice(bytes); // panics if `bytes` is longer than the 4-byte buffer
+        Self {
+            buffer,
+            buffer_len: len as u8,
+        }
+    }
+
+    fn take_buffer(&mut self) -> &[u8] {
+        let len = self.buffer_len as usize;
+        self.buffer_len = 0; // mark as empty; the returned slice still exposes the old bytes
+        &self.buffer[..len]
+    }
+
+    /// Returns `(consumed_from_input, None)` when there is not enough input yet,
+    /// `(consumed_from_input, Some(Err(())))` when the buffer now holds error bytes,
+    /// and `(consumed_from_input, Some(Ok(())))` when the buffer now holds a UTF-8 string.
+    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
+        let initial_buffer_len = self.buffer_len as usize;
+        let copied_from_input;
+        {
+            let unwritten = &mut self.buffer[initial_buffer_len..]; // free space after the buffered bytes
+            copied_from_input = cmp::min(unwritten.len(), input.len());
+            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
+        }
+        let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; // buffered bytes + bytes just copied
+        match str::from_utf8(spliced) {
+            Ok(_) => {
+                self.buffer_len = spliced.len() as u8; // everything spliced is valid UTF-8
+                (copied_from_input, Some(Ok(())))
+            }
+            Err(error) => {
+                let valid_up_to = error.valid_up_to();
+                if valid_up_to > 0 {
+                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); // panics if the valid prefix is shorter than what was already buffered
+                    self.buffer_len = valid_up_to as u8; // keep only the valid prefix
+                    (consumed, Some(Ok(())))
+                } else {
+                    match error.error_len() {
+                        Some(invalid_sequence_length) => {
+                            let consumed = invalid_sequence_length
+                                .checked_sub(initial_buffer_len)
+                                .unwrap(); // panics if the invalid sequence is shorter than what was already buffered
+                            self.buffer_len = invalid_sequence_length as u8; // keep only the invalid sequence
+                            (consumed, Some(Err(())))
+                        }
+                        None => {
+                            self.buffer_len = spliced.len() as u8; // still incomplete: keep all spliced bytes
+                            (copied_from_input, None)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
--- a/src/uu/wc/src/utf8/read.rs
+++ b/src/uu/wc/src/utf8/read.rs
@ -0,0 +1,137 @@
+// spell-checker:ignore bytestream
+use super::*;
+use std::error::Error;
+use std::fmt;
+use std::io::{self, BufRead};
+use std::str;
+
+/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
+pub struct BufReadDecoder<B: BufRead> {
+    buf_read: B, // the wrapped buffered reader
+    bytes_consumed: usize, // bytes to `consume` from `buf_read` before the next `fill_buf`
+    incomplete: Incomplete, // truncated sequence carried over from a previous buffer
+}
+
+#[derive(Debug)]
+pub enum BufReadDecoderError<'a> {
+    /// Represents one UTF-8 error in the byte stream.
+    ///
+    /// In lossy decoding, each such error should be replaced with U+FFFD.
+    /// (The lossy helpers of the original crate are not part of this copy; see `BufReadDecoder::next_strict`.)
+    InvalidByteSequence(&'a [u8]),
+
+    /// An I/O error from the underlying byte stream.
+    Io(io::Error),
+}
+
+impl<'a> fmt::Display for BufReadDecoderError<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(bytes) => {
+                write!(f, "invalid byte sequence: {:02x?}", bytes) // bytes printed through hex `Debug` formatting
+            }
+            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
+        }
+    }
+}
+
+impl<'a> Error for BufReadDecoderError<'a> {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(_) => None, // a decoding error has no underlying cause
+            BufReadDecoderError::Io(ref err) => Some(err), // expose the wrapped I/O error
+        }
+    }
+}
+
+impl<B: BufRead> BufReadDecoder<B> {
+    pub fn new(buf_read: B) -> Self {
+        Self {
+            buf_read,
+            bytes_consumed: 0, // nothing handed out yet
+            incomplete: Incomplete::empty(),
+        }
+    }
+
+    /// Decode and consume the next chunk of UTF-8 input.
+    ///
+    /// This method is intended to be called repeatedly until it returns `None`,
+    /// which represents EOF from the underlying byte stream.
+    /// This is similar to `Iterator::next`,
+    /// except that decoded chunks borrow the decoder (~iterator)
+    /// so they need to be handled or copied before the next chunk can start decoding.
+    pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
+        enum BytesSource {
+            BufRead(usize), // chunk is the first N bytes of the reader's buffer
+            Incomplete, // chunk is the content of `self.incomplete`
+        }
+        macro_rules! try_io {
+            ($io_result: expr) => {
+                match $io_result {
+                    Ok(value) => value,
+                    Err(error) => return Some(Err(BufReadDecoderError::Io(error))),
+                }
+            };
+        }
+        let (source, result) = loop {
+            if self.bytes_consumed > 0 {
+                self.buf_read.consume(self.bytes_consumed); // release bytes recorded by the previous iteration or call
+                self.bytes_consumed = 0;
+            }
+            let buf = try_io!(self.buf_read.fill_buf());
+
+            // Force loop iteration to go through an explicit `continue`
+            enum Unreachable {} // uninhabited, so every branch below has to diverge
+            let _: Unreachable = if self.incomplete.is_empty() {
+                if buf.is_empty() {
+                    return None; // EOF
+                }
+                match str::from_utf8(buf) {
+                    Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())), // whole buffer is valid UTF-8
+                    Err(error) => {
+                        let valid_up_to = error.valid_up_to();
+                        if valid_up_to > 0 {
+                            break (BytesSource::BufRead(valid_up_to), Ok(())); // yield the valid prefix first
+                        }
+                        match error.error_len() {
+                            Some(invalid_sequence_length) => {
+                                break (BytesSource::BufRead(invalid_sequence_length), Err(())) // invalid sequence at the start of the buffer
+                            }
+                            None => {
+                                self.bytes_consumed = buf.len(); // the whole buffer is a truncated sequence
+                                self.incomplete = Incomplete::new(buf);
+                                // need more input bytes
+                                continue;
+                            }
+                        }
+                    }
+                }
+            } else {
+                if buf.is_empty() {
+                    break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point
+                }
+                let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
+                self.bytes_consumed = consumed; // bytes of `buf` that were moved into `self.incomplete`
+                match opt_result {
+                    None => {
+                        // need more input bytes
+                        continue;
+                    }
+                    Some(result) => break (BytesSource::Incomplete, result),
+                }
+            };
+        };
+        let bytes = match source {
+            BytesSource::BufRead(byte_count) => {
+                self.bytes_consumed = byte_count; // consumed at the start of the next call, after the caller is done with the slice
+                let buf = try_io!(self.buf_read.fill_buf()); // NOTE(review): assumes this returns the same bytes as the `fill_buf` above — confirm
+                &buf[..byte_count]
+            }
+            BytesSource::Incomplete => self.incomplete.take_buffer(),
+        };
+        match result {
+            Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), // SAFETY: `Ok(())` is only produced after `str::from_utf8` accepted these bytes; NOTE(review): the BufRead case relies on the second `fill_buf` being unchanged — confirm
+            Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
+        }
+    }
+}
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@ -9,6 +9,7 @@

 mod count_fast;
 mod countable;
+mod utf8;
 mod word_count;
 use clap::builder::ValueParser;
 use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};