Expose Utf8Lossy as Utf8Chunks

dylni 2022-08-20 12:49:20 -04:00
10 changed files with 273 additions and 184 deletions

// Language features:

pub use core::str::{RSplitN, SplitN};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{RSplitTerminator, SplitTerminator};
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub use core::str::{Utf8Chunk, Utf8Chunks};
/// Note: `str` in `Concat<str>` is not meaningful here.
/// This type parameter of the trait only exists to enable another impl.

use core::ops::{self, Index, IndexMut, Range, RangeBounds};
use core::ptr;
use core::slice;
use core::str::lossy;
use core::str::pattern::Pattern;
use core::str::Utf8Chunks;
use crate::borrow::{Cow, ToOwned};
@ -628,11 +628,11 @@ pub fn from_utf8(vec: Vec<u8>) -> Result<String, FromUtf8Error> {
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
let mut iter = Utf8Chunks::new(v);
let first_valid = if let Some(chunk) = iter.next() {
let lossy::Utf8LossyChunk { valid, broken } = chunk;
if broken.is_empty() {
let valid = chunk.valid();
if chunk.invalid().is_empty() {
debug_assert_eq!(valid.len(), v.len());
return Cow::Borrowed(valid);
@ -647,9 +647,9 @@ pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
for lossy::Utf8LossyChunk { valid, broken } in iter {
if !broken.is_empty() {
for chunk in iter {
if !chunk.invalid().is_empty() {

use crate::char;
use crate::fmt::{self, Write};
use crate::mem;
use crate::fmt;
use crate::fmt::Formatter;
use crate::fmt::Write;
use crate::iter::FusedIterator;
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;
/// Lossy UTF-8 string.
#[unstable(feature = "str_internals", issue = "none")]
pub struct Utf8Lossy {
bytes: [u8],
/// An item returned by the [`Utf8Chunks`] iterator.
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
/// when decoding a UTF-8 string.
/// # Examples
/// ```
/// #![feature(utf8_chunks)]
/// use std::str::Utf8Chunks;
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
valid: &'a str,
invalid: &'a [u8],
impl Utf8Lossy {
impl<'a> Utf8Chunk<'a> {
/// Returns the next validated UTF-8 substring.
/// This substring can be empty at the start of the string or between
/// broken UTF-8 characters.
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
// SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
unsafe { mem::transmute(bytes) }
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn valid(&self) -> &'a str {
pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
Utf8LossyChunksIter { source: &self.bytes }
/// Returns the invalid sequence that caused a failure.
/// The returned slice will have a maximum length of 3 and starts after the
/// substring given by [`valid`]. Decoding will resume after this sequence.
/// If empty, this is the last chunk in the string. If non-empty, an
/// unexpected byte was encountered or the end of the input was reached
/// unexpectedly.
/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
/// CHARACTER`].
///
/// [`valid`]: Self::valid
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn invalid(&self) -> &'a [u8] {
/// Iterator over lossy UTF-8 string
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "str_internals", issue = "none")]
pub struct Utf8LossyChunksIter<'a> {
/// Borrowed byte-slice wrapper used for debug formatting.
///
/// Its [`fmt::Debug`] implementation walks the bytes with [`Utf8Chunks`],
/// escaping the valid UTF-8 portions via `char::escape_debug` and printing
/// every invalid byte as an upper-case `\xNN` hex escape.
pub struct Debug<'a>(&'a [u8]);
#[unstable(feature = "str_internals", issue = "none")]
impl fmt::Debug for Debug<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
for chunk in Utf8Chunks::new(self.0) {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
let valid = chunk.valid();
let mut from = 0;
for (i, c) in valid.char_indices() {
let esc = c.escape_debug();
// If char needs escaping, flush backlog so far and write, else skip
if esc.len() != 1 {
for c in esc {
from = i + c.len_utf8();
// Broken parts of string as hex escape.
for &b in chunk.invalid() {
write!(f, "\\x{:02X}", b)?;
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
/// If you want a simple conversion from UTF-8 byte slices to string slices,
/// [`from_utf8`] is easier to use.
/// [byteslice]: slice
/// [`from_utf8`]: super::from_utf8
/// # Examples
/// This can be used to create functionality similar to
/// [`String::from_utf8_lossy`] without allocating heap memory:
/// ```
/// #![feature(utf8_chunks)]
/// use std::str::Utf8Chunks;
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
/// for chunk in Utf8Chunks::new(input) {
/// push(chunk.valid());
/// if !chunk.invalid().is_empty() {
/// push("\u{FFFD}");
/// }
/// }
/// }
/// ```
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub struct Utf8Chunks<'a> {
source: &'a [u8],
#[unstable(feature = "str_internals", issue = "none")]
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
/// Sequence of valid chars.
/// Can be empty between broken UTF-8 chars.
pub valid: &'a str,
/// Single broken char, empty if none.
/// Empty iff iterator item is last.
pub broken: &'a [u8],
impl<'a> Utf8Chunks<'a> {
/// Creates a new iterator to decode the bytes.
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn new(bytes: &'a [u8]) -> Self {
Self { source: bytes }
#[unstable(feature = "str_internals", issue = "none")]
pub fn debug(&self) -> Debug<'_> {
impl<'a> Iterator for Utf8LossyChunksIter<'a> {
type Item = Utf8LossyChunk<'a>;
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl<'a> Iterator for Utf8Chunks<'a> {
type Item = Utf8Chunk<'a>;
fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
fn next(&mut self) -> Option<Utf8Chunk<'a>> {
if self.source.is_empty() {
return None;
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
// `valid_up_to = i` and `i` only increases.
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
Some(Utf8LossyChunk {
Some(Utf8Chunk {
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
valid: unsafe { from_utf8_unchecked(valid) },
impl fmt::Display for Utf8Lossy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// If we're the empty string then our iterator won't actually yield
// anything, so perform the formatting manually
if self.bytes.is_empty() {
return "".fmt(f);
// Marker impl only (no methods): declares `Utf8Chunks` a fused iterator,
// i.e. once `next` has returned `None` it is expected to keep returning
// `None` on every later call.
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl FusedIterator for Utf8Chunks<'_> {}
for Utf8LossyChunk { valid, broken } in self.chunks() {
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
// respect various formatting flags if possible.
if valid.len() == self.bytes.len() {
return valid.fmt(f);
if !broken.is_empty() {
impl fmt::Debug for Utf8Lossy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for Utf8LossyChunk { valid, broken } in self.chunks() {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
let mut from = 0;
for (i, c) in valid.char_indices() {
let esc = c.escape_debug();
// If char needs escaping, flush backlog so far and write, else skip
if esc.len() != 1 {
for c in esc {
from = i + c.len_utf8();
// Broken parts of string as hex escape.
for &b in broken {
write!(f, "\\x{:02x}", b)?;
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl fmt::Debug for Utf8Chunks<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()

pub mod pattern;
#[unstable(feature = "str_internals", issue = "none")]
pub mod lossy;
mod lossy;
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub use lossy::{Utf8Chunk, Utf8Chunks};
#[stable(feature = "rust1", since = "1.0.0")]
pub use converts::{from_utf8, from_utf8_unchecked};

extern crate test;

use core::str::lossy::*;
use core::str::Utf8Chunks;
fn chunks() {
let mut iter = Utf8Lossy::from_bytes(b"hello").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "hello", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
macro_rules! assert_chunks {
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
let mut iter = Utf8Chunks::new($string);
let chunk = iter.next().expect("missing chunk");
assert_eq!($valid, chunk.valid());
assert_eq!($invalid, chunk.invalid());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes("ศไทย中华Việt Nam".as_bytes()).chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "ศไทย中华Việt Nam", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"Hello\xC2 There\xFF Goodbye").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC2" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xFF" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC0" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xE6\x83" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"\xF5foo\xF5\x80bar").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF5" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF5" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF1" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF1\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF1\x80\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"\xF4foo\xF4\x80bar\xF4\xBFbaz").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF4" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF4\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF4" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
let mut iter = Utf8Lossy::from_bytes(b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF0" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "foo\u{10000}bar", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
assert_chunks!(b"hello", ("hello", b""));
assert_chunks!("ศไทย中华Việt Nam".as_bytes(), ("ศไทย中华Việt Nam", b""));
b"Hello\xC2 There\xFF Goodbye",
("Hello", b"\xC2"),
(" There", b"\xFF"),
(" Goodbye", b""),
b"Hello\xC0\x80 There\xE6\x83 Goodbye",
("Hello", b"\xC0"),
("", b"\x80"),
(" There", b"\xE6\x83"),
(" Goodbye", b""),
("", b"\xF5"),
("foo", b"\xF5"),
("", b"\x80"),
("bar", b""),
("", b"\xF1"),
("foo", b"\xF1\x80"),
("bar", b"\xF1\x80\x80"),
("baz", b""),
("", b"\xF4"),
("foo", b"\xF4\x80"),
("bar", b"\xF4"),
("", b"\xBF"),
("baz", b""),
("", b"\xF0"),
("", b"\x80"),
("", b"\x80"),
("", b"\x80"),
("foo\u{10000}bar", b""),
// surrogates
let mut iter = Utf8Lossy::from_bytes(b"\xED\xA0\x80foo\xED\xBF\xBFbar").chunks();
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xED" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xA0" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xED" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next());
assert_eq!(None, iter.next());
fn display() {
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
&Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
("", b"\xED"),
("", b"\xA0"),
("", b"\x80"),
("foo", b"\xED"),
("", b"\xBF"),
("", b"\xBF"),
("bar", b""),
fn debug() {
"\"Hello\\xc0\\x80 There\\xe6\\x83 Goodbye\\u{10d4ea}\"",
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa")
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),

// Library features (core):

use crate::sync::Arc;
use crate::sys_common::{AsInner, IntoInner};
use core::str::lossy::{Utf8Lossy, Utf8LossyChunk};
use core::str::Utf8Chunks;
#[path = "../unix/os_str/tests.rs"]
@ -29,26 +29,32 @@ pub struct Slice {
impl fmt::Debug for Slice {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
// Writes out a valid unicode string with the correct escape sequences
for Utf8LossyChunk { valid, broken } in Utf8Lossy::from_bytes(&self.inner).chunks() {
for c in valid.chars().flat_map(|c| c.escape_debug()) {
for b in broken {
write!(formatter, "\\x{:02X}", b)?;
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
impl fmt::Display for Slice {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&Utf8Lossy::from_bytes(&self.inner), formatter)
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// If we're the empty string then our iterator won't actually yield
// anything, so perform the formatting manually
if self.inner.is_empty() {
return "".fmt(f);
for chunk in Utf8Chunks::new(&self.inner) {
let valid = chunk.valid();
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
// respect various formatting flags if possible.
if chunk.invalid().is_empty() {
return valid.fmt(f);

assert_eq!(output, expected);
fn display() {
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),