mirror of
https://github.com/rust-lang/rust
synced 2024-10-14 12:33:57 +00:00
Stabilize Utf8Chunks
This commit is contained in:
parent
7ab5eb8fe7
commit
61cf00464e
|
@ -163,7 +163,6 @@
|
|||
#![feature(tuple_trait)]
|
||||
#![feature(unicode_internals)]
|
||||
#![feature(unsize)]
|
||||
#![feature(utf8_chunks)]
|
||||
#![feature(vec_pop_if)]
|
||||
// tidy-alphabetical-end
|
||||
//
|
||||
|
|
|
@ -53,7 +53,7 @@
|
|||
pub use core::str::{RSplitN, SplitN};
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub use core::str::{RSplitTerminator, SplitTerminator};
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
pub use core::str::{Utf8Chunk, Utf8Chunks};
|
||||
|
||||
/// Note: `str` in `Concat<str>` is not meaningful here.
|
||||
|
|
|
@ -58,8 +58,6 @@
|
|||
use core::ptr;
|
||||
use core::slice;
|
||||
use core::str::pattern::Pattern;
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use core::str::Utf8Chunks;
|
||||
|
||||
#[cfg(not(no_global_oom_handling))]
|
||||
use crate::borrow::{Cow, ToOwned};
|
||||
|
@ -633,7 +631,7 @@ pub fn from_utf8(vec: Vec<u8>) -> Result<String, FromUtf8Error> {
|
|||
#[cfg(not(no_global_oom_handling))]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
|
||||
let mut iter = Utf8Chunks::new(v);
|
||||
let mut iter = v.utf8_chunks();
|
||||
|
||||
let first_valid = if let Some(chunk) = iter.next() {
|
||||
let valid = chunk.valid();
|
||||
|
|
|
@ -6,6 +6,46 @@
|
|||
use super::from_utf8_unchecked;
|
||||
use super::validations::utf8_char_width;
|
||||
|
||||
impl [u8] {
|
||||
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
|
||||
/// slice, and the non-UTF-8 fragments in between.
///
/// The returned [`Utf8Chunks`] borrows this slice rather than copying it.
/// Each item it yields is a [`Utf8Chunk`], whose [`valid`](Utf8Chunk::valid)
/// and [`invalid`](Utf8Chunk::invalid) accessors expose the two parts of
/// the chunk.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
|
||||
/// code in the form of a C-string literal (`c"..."`).
|
||||
///
|
||||
/// ```
|
||||
/// use std::fmt::Write as _;
|
||||
///
|
||||
/// pub fn cstr_literal(bytes: &[u8]) -> String {
|
||||
/// let mut repr = String::new();
|
||||
/// repr.push_str("c\"");
|
||||
/// for chunk in bytes.utf8_chunks() {
|
||||
/// for ch in chunk.valid().chars() {
|
||||
/// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
|
||||
/// write!(repr, "{}", ch.escape_debug()).unwrap();
|
||||
/// }
|
||||
/// for byte in chunk.invalid() {
|
||||
/// write!(repr, "\\x{:02X}", byte).unwrap();
|
||||
/// }
|
||||
/// }
|
||||
/// repr.push('"');
|
||||
/// repr
|
||||
/// }
|
||||
///
|
||||
/// fn main() {
|
||||
/// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
|
||||
/// let expected = stringify!(c"\xFErris the 🦀\u{7}");
|
||||
/// assert_eq!(lit, expected);
|
||||
/// }
|
||||
/// ```
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
|
||||
// Construction only wraps the borrowed slice; the private `source` field
// is set directly because this module defines `Utf8Chunks`.
Utf8Chunks { source: self }
|
||||
}
|
||||
}
|
||||
|
||||
/// An item returned by the [`Utf8Chunks`] iterator.
|
||||
///
|
||||
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
|
||||
|
@ -14,15 +54,11 @@
|
|||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(utf8_chunks)]
|
||||
///
|
||||
/// use std::str::Utf8Chunks;
|
||||
///
|
||||
/// // An invalid UTF-8 string
|
||||
/// let bytes = b"foo\xF1\x80bar";
|
||||
///
|
||||
/// // Decode the first `Utf8Chunk`
|
||||
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
|
||||
/// let chunk = bytes.utf8_chunks().next().unwrap();
|
||||
///
|
||||
/// // The first three characters are valid UTF-8
|
||||
/// assert_eq!("foo", chunk.valid());
|
||||
|
@ -30,7 +66,7 @@
|
|||
/// // The fourth character is broken
|
||||
/// assert_eq!(b"\xF1\x80", chunk.invalid());
|
||||
/// ```
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct Utf8Chunk<'a> {
|
||||
valid: &'a str,
|
||||
|
@ -43,7 +79,7 @@ impl<'a> Utf8Chunk<'a> {
|
|||
/// This substring can be empty at the start of the string or between
|
||||
/// broken UTF-8 characters.
|
||||
#[must_use]
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
pub fn valid(&self) -> &'a str {
|
||||
self.valid
|
||||
}
|
||||
|
@ -63,7 +99,7 @@ pub fn valid(&self) -> &'a str {
|
|||
/// [`valid`]: Self::valid
|
||||
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
|
||||
#[must_use]
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
pub fn invalid(&self) -> &'a [u8] {
|
||||
self.invalid
|
||||
}
|
||||
|
@ -78,7 +114,7 @@ impl fmt::Debug for Debug<'_> {
|
|||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
f.write_char('"')?;
|
||||
|
||||
for chunk in Utf8Chunks::new(self.0) {
|
||||
for chunk in self.0.utf8_chunks() {
|
||||
// Valid part.
|
||||
// Here we partially parse UTF-8 again which is suboptimal.
|
||||
{
|
||||
|
@ -123,12 +159,8 @@ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
|||
/// [`String::from_utf8_lossy`] without allocating heap memory:
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(utf8_chunks)]
|
||||
///
|
||||
/// use std::str::Utf8Chunks;
|
||||
///
|
||||
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
|
||||
/// for chunk in Utf8Chunks::new(input) {
|
||||
/// for chunk in input.utf8_chunks() {
|
||||
/// push(chunk.valid());
|
||||
///
|
||||
/// if !chunk.invalid().is_empty() {
|
||||
|
@ -140,19 +172,13 @@ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
|||
///
|
||||
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
|
||||
#[must_use = "iterators are lazy and do nothing unless consumed"]
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
#[derive(Clone)]
|
||||
pub struct Utf8Chunks<'a> {
|
||||
source: &'a [u8],
|
||||
}
|
||||
|
||||
impl<'a> Utf8Chunks<'a> {
|
||||
/// Creates a new iterator to decode the bytes.
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
pub fn new(bytes: &'a [u8]) -> Self {
|
||||
Self { source: bytes }
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
#[unstable(feature = "str_internals", issue = "none")]
|
||||
pub fn debug(&self) -> Debug<'_> {
|
||||
|
@ -160,7 +186,7 @@ pub fn debug(&self) -> Debug<'_> {
|
|||
}
|
||||
}
|
||||
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
impl<'a> Iterator for Utf8Chunks<'a> {
|
||||
type Item = Utf8Chunk<'a>;
|
||||
|
||||
|
@ -259,10 +285,10 @@ fn safe_get(xs: &[u8], i: usize) -> u8 {
|
|||
}
|
||||
}
|
||||
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
impl FusedIterator for Utf8Chunks<'_> {}
|
||||
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
impl fmt::Debug for Utf8Chunks<'_> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
pub mod pattern;
|
||||
|
||||
mod lossy;
|
||||
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
|
||||
pub use lossy::{Utf8Chunk, Utf8Chunks};
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
|
|
|
@ -118,7 +118,6 @@
|
|||
#![feature(error_generic_member_access)]
|
||||
#![feature(error_in_core)]
|
||||
#![feature(trait_upcasting)]
|
||||
#![feature(utf8_chunks)]
|
||||
#![feature(is_ascii_octdigit)]
|
||||
#![feature(get_many_mut)]
|
||||
#![feature(iter_map_windows)]
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
use core::str::Utf8Chunks;
|
||||
|
||||
#[test]
|
||||
fn chunks() {
|
||||
macro_rules! assert_chunks {
|
||||
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
|
||||
let mut iter = Utf8Chunks::new($string);
|
||||
let mut iter = $string.utf8_chunks();
|
||||
$(
|
||||
let chunk = iter.next().expect("missing chunk");
|
||||
assert_eq!($valid, chunk.valid());
|
||||
|
@ -79,7 +77,7 @@ fn debug() {
|
|||
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
|
||||
&format!(
|
||||
"{:?}",
|
||||
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),
|
||||
b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa".utf8_chunks().debug(),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
|
|
@ -313,7 +313,6 @@
|
|||
#![feature(thread_local)]
|
||||
#![feature(try_blocks)]
|
||||
#![feature(type_alias_impl_trait)]
|
||||
#![feature(utf8_chunks)]
|
||||
// tidy-alphabetical-end
|
||||
//
|
||||
// Library features (core):
|
||||
|
|
|
@ -11,8 +11,6 @@
|
|||
use crate::sync::Arc;
|
||||
use crate::sys_common::{AsInner, IntoInner};
|
||||
|
||||
use core::str::Utf8Chunks;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
|
@ -29,7 +27,7 @@ pub struct Slice {
|
|||
|
||||
impl fmt::Debug for Slice {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
|
||||
fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -41,7 +39,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|||
return "".fmt(f);
|
||||
}
|
||||
|
||||
for chunk in Utf8Chunks::new(&self.inner) {
|
||||
for chunk in self.inner.utf8_chunks() {
|
||||
let valid = chunk.valid();
|
||||
// If we successfully decoded the whole chunk as a valid string then
|
||||
// we can return a direct formatting of the string which will also
|
||||
|
|
Loading…
Reference in a new issue