From e8ee0b7b2b9f004c51f7a10eeff0fbfe79af28d8 Mon Sep 17 00:00:00 2001 From: dylni <46035563+dylni@users.noreply.github.com> Date: Sat, 20 Aug 2022 12:49:20 -0400 Subject: [PATCH] Expose `Utf8Lossy` as `Utf8Chunks` --- library/alloc/src/lib.rs | 1 + library/alloc/src/str.rs | 2 + library/alloc/src/string.rs | 16 +- library/core/src/str/lossy.rs | 244 +++++++++++++++-------- library/core/src/str/mod.rs | 6 +- library/core/tests/lib.rs | 1 + library/core/tests/str_lossy.rs | 138 ++++++------- library/std/src/lib.rs | 1 + library/std/src/sys/unix/os_str.rs | 40 ++-- library/std/src/sys/unix/os_str/tests.rs | 8 + 10 files changed, 273 insertions(+), 184 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 315469387e5..dfff2fb691b 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -141,6 +141,7 @@ #![feature(unchecked_math)] #![feature(unicode_internals)] #![feature(unsize)] +#![feature(utf8_chunks)] #![feature(std_internals)] // // Language features: diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index d5ed2c4adf4..b94b1b1ce21 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -71,6 +71,8 @@ pub use core::str::{RSplitN, SplitN}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::{RSplitTerminator, SplitTerminator}; +#[unstable(feature = "utf8_chunks", issue = "99543")] +pub use core::str::{Utf8Chunk, Utf8Chunks}; /// Note: `str` in `Concat` is not meaningful here. /// This type parameter of the trait only exists to enable another impl. 
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index b1513e5e0f3..8eb030369a1 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -58,9 +58,9 @@ use core::ops::{self, Index, IndexMut, Range, RangeBounds}; use core::ptr; use core::slice; -#[cfg(not(no_global_oom_handling))] -use core::str::lossy; use core::str::pattern::Pattern; +#[cfg(not(no_global_oom_handling))] +use core::str::Utf8Chunks; #[cfg(not(no_global_oom_handling))] use crate::borrow::{Cow, ToOwned}; @@ -628,11 +628,11 @@ pub fn from_utf8(vec: Vec<u8>) -> Result<String, FromUtf8Error> { #[cfg(not(no_global_oom_handling))] #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> { - let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks(); + let mut iter = Utf8Chunks::new(v); let first_valid = if let Some(chunk) = iter.next() { - let lossy::Utf8LossyChunk { valid, broken } = chunk; - if broken.is_empty() { + let valid = chunk.valid(); + if chunk.invalid().is_empty() { debug_assert_eq!(valid.len(), v.len()); return Cow::Borrowed(valid); } @@ -647,9 +647,9 @@ pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> { res.push_str(first_valid); res.push_str(REPLACEMENT); - for lossy::Utf8LossyChunk { valid, broken } in iter { - res.push_str(valid); - if !broken.is_empty() { + for chunk in iter { + res.push_str(chunk.valid()); + if !chunk.invalid().is_empty() { res.push_str(REPLACEMENT); } } diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 6ec1c93908f..59f873d1268 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -1,51 +1,170 @@ -use crate::char; -use crate::fmt::{self, Write}; -use crate::mem; +use crate::fmt; +use crate::fmt::Formatter; +use crate::fmt::Write; +use crate::iter::FusedIterator; use super::from_utf8_unchecked; use super::validations::utf8_char_width; -/// Lossy UTF-8 string. 
-#[unstable(feature = "str_internals", issue = "none")] -pub struct Utf8Lossy { - bytes: [u8], +/// An item returned by the [`Utf8Chunks`] iterator. +/// +/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character +/// when decoding a UTF-8 string. +/// +/// # Examples +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// // An invalid UTF-8 string +/// let bytes = b"foo\xF1\x80bar"; +/// +/// // Decode the first `Utf8Chunk` +/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); +/// +/// // The first three characters are valid UTF-8 +/// assert_eq!("foo", chunk.valid()); +/// +/// // The fourth character is broken +/// assert_eq!(b"\xF1\x80", chunk.invalid()); +/// ``` +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Utf8Chunk<'a> { + valid: &'a str, + invalid: &'a [u8], } -impl Utf8Lossy { +impl<'a> Utf8Chunk<'a> { + /// Returns the next validated UTF-8 substring. + /// + /// This substring can be empty at the start of the string or between + /// broken UTF-8 characters. #[must_use] - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { - // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. - unsafe { mem::transmute(bytes) } + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn valid(&self) -> &'a str { + self.valid } - pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { - Utf8LossyChunksIter { source: &self.bytes } + /// Returns the invalid sequence that caused a failure. + /// + /// The returned slice will have a maximum length of 3 and starts after the + /// substring given by [`valid`]. Decoding will resume after this sequence. + /// + /// If empty, this is the last chunk in the string. If non-empty, an + /// unexpected byte was encountered or the end of the input was reached + /// unexpectedly. + /// + /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT + /// CHARACTER`]. 
+ /// + /// [`valid`]: Self::valid + /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER + #[must_use] + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn invalid(&self) -> &'a [u8] { + self.invalid } } -/// Iterator over lossy UTF-8 string -#[must_use = "iterators are lazy and do nothing unless consumed"] +#[must_use] #[unstable(feature = "str_internals", issue = "none")] -#[allow(missing_debug_implementations)] -pub struct Utf8LossyChunksIter<'a> { +pub struct Debug<'a>(&'a [u8]); + +#[unstable(feature = "str_internals", issue = "none")] +impl fmt::Debug for Debug<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + + for chunk in Utf8Chunks::new(self.0) { + // Valid part. + // Here we partially parse UTF-8 again which is suboptimal. + { + let valid = chunk.valid(); + let mut from = 0; + for (i, c) in valid.char_indices() { + let esc = c.escape_debug(); + // If char needs escaping, flush backlog so far and write, else skip + if esc.len() != 1 { + f.write_str(&valid[from..i])?; + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + f.write_str(&valid[from..])?; + } + + // Broken parts of string as hex escape. + for &b in chunk.invalid() { + write!(f, "\\x{:02X}", b)?; + } + } + + f.write_char('"') + } +} + +/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices +/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). +/// +/// If you want a simple conversion from UTF-8 byte slices to string slices, +/// [`from_utf8`] is easier to use. 
+/// +/// [byteslice]: slice +/// [`from_utf8`]: super::from_utf8 +/// +/// # Examples +/// +/// This can be used to create functionality similar to +/// [`String::from_utf8_lossy`] without allocating heap memory: +/// +/// ``` +/// #![feature(utf8_chunks)] +/// +/// use std::str::Utf8Chunks; +/// +/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { +/// for chunk in Utf8Chunks::new(input) { +/// push(chunk.valid()); +/// +/// if !chunk.invalid().is_empty() { +/// push("\u{FFFD}"); +/// } +/// } +/// } +/// ``` +/// +/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[unstable(feature = "utf8_chunks", issue = "99543")] +#[derive(Clone)] +pub struct Utf8Chunks<'a> { source: &'a [u8], } -#[unstable(feature = "str_internals", issue = "none")] -#[derive(PartialEq, Eq, Debug)] -pub struct Utf8LossyChunk<'a> { - /// Sequence of valid chars. - /// Can be empty between broken UTF-8 chars. - pub valid: &'a str, - /// Single broken char, empty if none. - /// Empty iff iterator item is last. - pub broken: &'a [u8], +impl<'a> Utf8Chunks<'a> { + /// Creates a new iterator to decode the bytes. + #[unstable(feature = "utf8_chunks", issue = "99543")] + pub fn new(bytes: &'a [u8]) -> Self { + Self { source: bytes } + } + + #[doc(hidden)] + #[unstable(feature = "str_internals", issue = "none")] + pub fn debug(&self) -> Debug<'_> { + Debug(self.source) + } } -impl<'a> Iterator for Utf8LossyChunksIter<'a> { - type Item = Utf8LossyChunk<'a>; +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl<'a> Iterator for Utf8Chunks<'a> { + type Item = Utf8Chunk<'a>; - fn next(&mut self) -> Option<Utf8LossyChunk<'a>> { + fn next(&mut self) -> Option<Utf8Chunk<'a>> { if self.source.is_empty() { return None; } @@ -130,71 +249,22 @@ fn safe_get(xs: &[u8], i: usize) -> u8 { // SAFETY: `valid_up_to <= i` because it is only ever assigned via // `valid_up_to = i` and `i` only increases. 
- let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; + let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; - Some(Utf8LossyChunk { + Some(Utf8Chunk { // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. valid: unsafe { from_utf8_unchecked(valid) }, - broken, + invalid, }) } } -impl fmt::Display for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // If we're the empty string then our iterator won't actually yield - // anything, so perform the formatting manually - if self.bytes.is_empty() { - return "".fmt(f); - } +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl FusedIterator for Utf8Chunks<'_> {} - for Utf8LossyChunk { valid, broken } in self.chunks() { - // If we successfully decoded the whole chunk as a valid string then - // we can return a direct formatting of the string which will also - // respect various formatting flags if possible. - if valid.len() == self.bytes.len() { - assert!(broken.is_empty()); - return valid.fmt(f); - } - - f.write_str(valid)?; - if !broken.is_empty() { - f.write_char(char::REPLACEMENT_CHARACTER)?; - } - } - Ok(()) - } -} - -impl fmt::Debug for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_char('"')?; - - for Utf8LossyChunk { valid, broken } in self.chunks() { - // Valid part. - // Here we partially parse UTF-8 again which is suboptimal. - { - let mut from = 0; - for (i, c) in valid.char_indices() { - let esc = c.escape_debug(); - // If char needs escaping, flush backlog so far and write, else skip - if esc.len() != 1 { - f.write_str(&valid[from..i])?; - for c in esc { - f.write_char(c)?; - } - from = i + c.len_utf8(); - } - } - f.write_str(&valid[from..])?; - } - - // Broken parts of string as hex escape. 
- for &b in broken { - write!(f, "\\x{:02x}", b)?; - } - } - - f.write_char('"') +#[unstable(feature = "utf8_chunks", issue = "99543")] +impl fmt::Debug for Utf8Chunks<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() } } diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index c4f2e283eb3..5723188db69 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -22,9 +22,9 @@ pub mod pattern; -#[unstable(feature = "str_internals", issue = "none")] -#[allow(missing_docs)] -pub mod lossy; +mod lossy; +#[unstable(feature = "utf8_chunks", issue = "99543")] +pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] pub use converts::{from_utf8, from_utf8_unchecked}; diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs index db94368f6e0..68dc8a87c76 100644 --- a/library/core/tests/lib.rs +++ b/library/core/tests/lib.rs @@ -96,6 +96,7 @@ #![feature(waker_getters)] #![feature(slice_flatten)] #![feature(provide_any)] +#![feature(utf8_chunks)] #![deny(unsafe_op_in_unsafe_fn)] extern crate test; diff --git a/library/core/tests/str_lossy.rs b/library/core/tests/str_lossy.rs index d4b47a4708e..9d3f0b65fdb 100644 --- a/library/core/tests/str_lossy.rs +++ b/library/core/tests/str_lossy.rs @@ -1,85 +1,85 @@ -use core::str::lossy::*; +use core::str::Utf8Chunks; #[test] fn chunks() { - let mut iter = Utf8Lossy::from_bytes(b"hello").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "hello", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); + macro_rules! assert_chunks { + ( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? 
) => {{ + let mut iter = Utf8Chunks::new($string); + $( + let chunk = iter.next().expect("missing chunk"); + assert_eq!($valid, chunk.valid()); + assert_eq!($invalid, chunk.invalid()); + )* + assert_eq!(None, iter.next()); + }}; + } - let mut iter = Utf8Lossy::from_bytes("ศไทย中华Việt Nam".as_bytes()).chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "ศไทย中华Việt Nam", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"Hello\xC2 There\xFF Goodbye").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC2" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xFF" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC0" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xE6\x83" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF5foo\xF5\x80bar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF5" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF5" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF1" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF1\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: 
"bar", broken: b"\xF1\x80\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF4foo\xF4\x80bar\xF4\xBFbaz").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF4" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF4\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF4" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF0" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo\u{10000}bar", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); + assert_chunks!(b"hello", ("hello", b"")); + assert_chunks!("ศไทย中华Việt Nam".as_bytes(), ("ศไทย中华Việt Nam", b"")); + assert_chunks!( + b"Hello\xC2 There\xFF Goodbye", + ("Hello", b"\xC2"), + (" There", b"\xFF"), + (" Goodbye", b""), + ); + assert_chunks!( + b"Hello\xC0\x80 There\xE6\x83 Goodbye", + ("Hello", b"\xC0"), + ("", b"\x80"), + (" There", b"\xE6\x83"), + (" Goodbye", b""), + ); + assert_chunks!( + b"\xF5foo\xF5\x80bar", + ("", b"\xF5"), + ("foo", b"\xF5"), + ("", b"\x80"), + ("bar", b""), + ); + assert_chunks!( + b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", + ("", b"\xF1"), + ("foo", b"\xF1\x80"), + ("bar", b"\xF1\x80\x80"), + ("baz", b""), + ); + assert_chunks!( + b"\xF4foo\xF4\x80bar\xF4\xBFbaz", + ("", b"\xF4"), + ("foo", b"\xF4\x80"), + ("bar", b"\xF4"), + 
("", b"\xBF"), + ("baz", b""), + ); + assert_chunks!( + b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", + ("", b"\xF0"), + ("", b"\x80"), + ("", b"\x80"), + ("", b"\x80"), + ("foo\u{10000}bar", b""), + ); // surrogates - let mut iter = Utf8Lossy::from_bytes(b"\xED\xA0\x80foo\xED\xBF\xBFbar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xED" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xA0" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xED" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next()); - assert_eq!(None, iter.next()); -} - -#[test] -fn display() { - assert_eq!( - "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", - &Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string() + assert_chunks!( + b"\xED\xA0\x80foo\xED\xBF\xBFbar", + ("", b"\xED"), + ("", b"\xA0"), + ("", b"\x80"), + ("foo", b"\xED"), + ("", b"\xBF"), + ("", b"\xBF"), + ("bar", b""), ); } #[test] fn debug() { assert_eq!( - "\"Hello\\xc0\\x80 There\\xe6\\x83 Goodbye\\u{10d4ea}\"", + "\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"", &format!( "{:?}", - Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa") - ) + Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(), + ), ); } diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 9fd426d3ac4..773fbf12582 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -258,6 +258,7 @@ #![feature(staged_api)] #![feature(thread_local)] #![feature(try_blocks)] +#![feature(utf8_chunks)] // // Library features (core): #![feature(array_error_internals)] diff --git a/library/std/src/sys/unix/os_str.rs 
b/library/std/src/sys/unix/os_str.rs index ccbc182240c..017e2af29d4 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -11,7 +11,7 @@ use crate::sync::Arc; use crate::sys_common::{AsInner, IntoInner}; -use core::str::lossy::{Utf8Lossy, Utf8LossyChunk}; +use core::str::Utf8Chunks; #[cfg(test)] #[path = "../unix/os_str/tests.rs"] @@ -29,26 +29,32 @@ pub struct Slice { } impl fmt::Debug for Slice { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - // Writes out a valid unicode string with the correct escape sequences - - formatter.write_str("\"")?; - for Utf8LossyChunk { valid, broken } in Utf8Lossy::from_bytes(&self.inner).chunks() { - for c in valid.chars().flat_map(|c| c.escape_debug()) { - formatter.write_char(c)? - } - - for b in broken { - write!(formatter, "\\x{:02X}", b)?; - } - } - formatter.write_str("\"") + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f) } } impl fmt::Display for Slice { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&Utf8Lossy::from_bytes(&self.inner), formatter) + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // If we're the empty string then our iterator won't actually yield + // anything, so perform the formatting manually + if self.inner.is_empty() { + return "".fmt(f); + } + + for chunk in Utf8Chunks::new(&self.inner) { + let valid = chunk.valid(); + // If we successfully decoded the whole chunk as a valid string then + // we can return a direct formatting of the string which will also + // respect various formatting flags if possible. 
+ if chunk.invalid().is_empty() { + return valid.fmt(f); + } + + f.write_str(valid)?; + f.write_char(char::REPLACEMENT_CHARACTER)?; + } + Ok(()) } } diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs index 213277f01f2..22ba0c92350 100644 --- a/library/std/src/sys/unix/os_str/tests.rs +++ b/library/std/src/sys/unix/os_str/tests.rs @@ -8,3 +8,11 @@ fn slice_debug_output() { assert_eq!(output, expected); } + +#[test] +fn display() { + assert_eq!( + "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", + Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(), + ); +}