Auto merge of #123909 - dtolnay:utf8chunks, r=joboet

Stabilize `Utf8Chunks`

Pending FCP in https://github.com/rust-lang/rust/issues/99543.

This PR includes the proposed modification in https://github.com/rust-lang/libs-team/issues/190 as agreed in https://github.com/rust-lang/rust/issues/99543#issuecomment-2050406568.
This commit is contained in:
bors 2024-04-26 17:41:24 +00:00
commit 4d570eea02
9 changed files with 57 additions and 40 deletions

View file

@ -161,7 +161,6 @@
#![feature(tuple_trait)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(utf8_chunks)]
#![feature(vec_pop_if)]
// tidy-alphabetical-end
//

View file

@ -53,7 +53,7 @@
pub use core::str::{RSplitN, SplitN};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{RSplitTerminator, SplitTerminator};
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use core::str::{Utf8Chunk, Utf8Chunks};
/// Note: `str` in `Concat<str>` is not meaningful here.

View file

@ -58,8 +58,6 @@
use core::ptr;
use core::slice;
use core::str::pattern::Pattern;
#[cfg(not(no_global_oom_handling))]
use core::str::Utf8Chunks;
#[cfg(not(no_global_oom_handling))]
use crate::borrow::{Cow, ToOwned};
@ -633,7 +631,7 @@ pub fn from_utf8(vec: Vec<u8>) -> Result<String, FromUtf8Error> {
#[cfg(not(no_global_oom_handling))]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
let mut iter = Utf8Chunks::new(v);
let mut iter = v.utf8_chunks();
let first_valid = if let Some(chunk) = iter.next() {
let valid = chunk.valid();

View file

@ -6,6 +6,46 @@
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;
impl [u8] {
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
/// slice, and the non-UTF-8 fragments in between.
///
/// Each item is a [`Utf8Chunk`]: [`Utf8Chunk::valid`] returns the valid
/// part as a `&str` (possibly empty), and [`Utf8Chunk::invalid`] returns
/// the invalid bytes that follow it as a `&[u8]` (possibly empty).
///
/// # Examples
///
/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
/// code in the form of a C-string literal (`c"..."`).
///
/// ```
/// use std::fmt::Write as _;
///
/// pub fn cstr_literal(bytes: &[u8]) -> String {
/// let mut repr = String::new();
/// repr.push_str("c\"");
/// for chunk in bytes.utf8_chunks() {
/// for ch in chunk.valid().chars() {
/// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
/// write!(repr, "{}", ch.escape_debug()).unwrap();
/// }
/// for byte in chunk.invalid() {
/// write!(repr, "\\x{:02X}", byte).unwrap();
/// }
/// }
/// repr.push('"');
/// repr
/// }
///
/// fn main() {
/// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
/// let expected = stringify!(c"\xFErris the 🦀\u{7}");
/// assert_eq!(lit, expected);
/// }
/// ```
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
// Only wraps the slice; nothing is decoded by this call itself.
Utf8Chunks { source: self }
}
}
/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
@ -14,15 +54,11 @@
/// # Examples
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
/// let chunk = bytes.utf8_chunks().next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
@ -30,7 +66,7 @@
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
valid: &'a str,
@ -43,7 +79,7 @@ impl<'a> Utf8Chunk<'a> {
/// This substring can be empty at the start of the string or between
/// broken UTF-8 characters.
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn valid(&self) -> &'a str {
// Hands back the stored `&str` unchanged; the result borrows for `'a`,
// not for the lifetime of `&self`.
self.valid
}
@ -63,7 +99,7 @@ pub fn valid(&self) -> &'a str {
/// [`valid`]: Self::valid
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn invalid(&self) -> &'a [u8] {
// Hands back the stored byte slice unchanged; the result borrows for
// `'a`, not for the lifetime of `&self`.
self.invalid
}
@ -78,7 +114,7 @@ impl fmt::Debug for Debug<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_char('"')?;
for chunk in Utf8Chunks::new(self.0) {
for chunk in self.0.utf8_chunks() {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
{
@ -123,12 +159,8 @@ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
/// for chunk in Utf8Chunks::new(input) {
/// for chunk in input.utf8_chunks() {
/// push(chunk.valid());
///
/// if !chunk.invalid().is_empty() {
@ -140,19 +172,13 @@ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
// The byte slice this iterator decodes; `[u8]::utf8_chunks` stores the
// whole input slice here when the iterator is created.
source: &'a [u8],
}
impl<'a> Utf8Chunks<'a> {
/// Creates a new iterator to decode the bytes.
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn new(bytes: &'a [u8]) -> Self {
Self { source: bytes }
}
#[doc(hidden)]
#[unstable(feature = "str_internals", issue = "none")]
pub fn debug(&self) -> Debug<'_> {
@ -160,7 +186,7 @@ pub fn debug(&self) -> Debug<'_> {
}
}
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl<'a> Iterator for Utf8Chunks<'a> {
type Item = Utf8Chunk<'a>;
@ -259,10 +285,10 @@ fn safe_get(xs: &[u8], i: usize) -> u8 {
}
}
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
// Marker impl with no methods: declares that `Utf8Chunks` upholds the
// `FusedIterator` contract.
impl FusedIterator for Utf8Chunks<'_> {}
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl fmt::Debug for Utf8Chunks<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()

View file

@ -24,7 +24,7 @@
pub mod pattern;
mod lossy;
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use lossy::{Utf8Chunk, Utf8Chunks};
#[stable(feature = "rust1", since = "1.0.0")]

View file

@ -117,7 +117,6 @@
#![feature(error_generic_member_access)]
#![feature(error_in_core)]
#![feature(trait_upcasting)]
#![feature(utf8_chunks)]
#![feature(is_ascii_octdigit)]
#![feature(get_many_mut)]
#![feature(iter_map_windows)]

View file

@ -1,10 +1,8 @@
use core::str::Utf8Chunks;
#[test]
fn chunks() {
macro_rules! assert_chunks {
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
let mut iter = Utf8Chunks::new($string);
let mut iter = $string.utf8_chunks();
$(
let chunk = iter.next().expect("missing chunk");
assert_eq!($valid, chunk.valid());
@ -79,7 +77,7 @@ fn debug() {
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
&format!(
"{:?}",
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),
b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa".utf8_chunks().debug(),
),
);
}

View file

@ -314,7 +314,6 @@
#![feature(thread_local)]
#![feature(try_blocks)]
#![feature(type_alias_impl_trait)]
#![feature(utf8_chunks)]
// tidy-alphabetical-end
//
// Library features (core):

View file

@ -11,8 +11,6 @@
use crate::sync::Arc;
use crate::sys_common::{AsInner, IntoInner};
use core::str::Utf8Chunks;
#[cfg(test)]
mod tests;
@ -29,7 +27,7 @@ pub struct Slice {
impl fmt::Debug for Slice {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
}
}
@ -41,7 +39,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
return "".fmt(f);
}
for chunk in Utf8Chunks::new(&self.inner) {
for chunk in self.inner.utf8_chunks() {
let valid = chunk.valid();
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also