Serialize incr comp structures to file via fixed-size buffer

Reduce a large memory spike that happens during serialization by writing
the incr comp structures to file by way of a fixed-size buffer, rather
than an unbounded vector.

Effort was made to keep the instruction count close to that of the
previous implementation. However, buffered writing to a file inherently
has more overhead than writing to a vector, because each write may
result in a handleable error. To reduce this overhead, arrangements are
made so that each LEB128-encoded integer can be written to the buffer
with only one capacity and error check. Higher-level optimizations in
which entire composite structures can be written with one capacity and
error check are possible, but would require much more work.

The performance is mostly on par with the previous implementation, with
small to moderate instruction count regressions. The memory reduction is
significant, however, so it seems like a worth-while trade-off.
This commit is contained in:
Tyson Nottingham 2020-12-06 17:30:55 -08:00
parent 6526e5c772
commit 52f21791fb
11 changed files with 611 additions and 176 deletions

View file

@ -1,6 +1,6 @@
use crate::stable_hasher;
use rustc_serialize::{
opaque::{self, EncodeResult},
opaque::{self, EncodeResult, FileEncodeResult},
Decodable, Encodable,
};
use std::hash::{Hash, Hasher};
@ -53,13 +53,6 @@ pub fn to_hex(&self) -> String {
format!("{:x}{:x}", self.0, self.1)
}
pub fn encode_opaque(&self, encoder: &mut opaque::Encoder) -> EncodeResult {
let bytes: [u8; 16] = unsafe { mem::transmute([self.0.to_le(), self.1.to_le()]) };
encoder.emit_raw_bytes(&bytes);
Ok(())
}
pub fn decode_opaque(decoder: &mut opaque::Decoder<'_>) -> Result<Fingerprint, String> {
let mut bytes: [MaybeUninit<u8>; 16] = MaybeUninit::uninit_array();
@ -142,7 +135,16 @@ impl<E: rustc_serialize::Encoder> FingerprintEncoder for E {
impl FingerprintEncoder for opaque::Encoder {
fn encode_fingerprint(&mut self, f: &Fingerprint) -> EncodeResult {
f.encode_opaque(self)
let bytes: [u8; 16] = unsafe { mem::transmute([f.0.to_le(), f.1.to_le()]) };
self.emit_raw_bytes(&bytes);
Ok(())
}
}
impl FingerprintEncoder for opaque::FileEncoder {
fn encode_fingerprint(&mut self, f: &Fingerprint) -> FileEncodeResult {
let bytes: [u8; 16] = unsafe { mem::transmute([f.0.to_le(), f.1.to_le()]) };
self.emit_raw_bytes(&bytes)
}
}
@ -198,7 +200,7 @@ fn encode(&self, s: &mut E) -> Result<(), E::Error> {
impl<D: rustc_serialize::Decoder> Decodable<D> for PackedFingerprint {
#[inline]
fn decode(d: &mut D) -> Result<Self, D::Error> {
Fingerprint::decode(d).map(|f| PackedFingerprint(f))
Fingerprint::decode(d).map(PackedFingerprint)
}
}

View file

@ -14,7 +14,7 @@
use std::io::{self, Read};
use std::path::Path;
use rustc_serialize::opaque::Encoder;
use rustc_serialize::opaque::{FileEncodeResult, FileEncoder};
/// The first few bytes of files generated by incremental compilation.
const FILE_MAGIC: &[u8] = b"RSIC";
@ -27,15 +27,17 @@
/// the Git commit hash.
const RUSTC_VERSION: Option<&str> = option_env!("CFG_VERSION");
pub fn write_file_header(stream: &mut Encoder, nightly_build: bool) {
stream.emit_raw_bytes(FILE_MAGIC);
stream
.emit_raw_bytes(&[(HEADER_FORMAT_VERSION >> 0) as u8, (HEADER_FORMAT_VERSION >> 8) as u8]);
pub fn write_file_header(stream: &mut FileEncoder, nightly_build: bool) -> FileEncodeResult {
stream.emit_raw_bytes(FILE_MAGIC)?;
stream.emit_raw_bytes(&[
(HEADER_FORMAT_VERSION >> 0) as u8,
(HEADER_FORMAT_VERSION >> 8) as u8,
])?;
let rustc_version = rustc_version(nightly_build);
assert_eq!(rustc_version.len(), (rustc_version.len() as u8) as usize);
stream.emit_raw_bytes(&[rustc_version.len() as u8]);
stream.emit_raw_bytes(rustc_version.as_bytes());
stream.emit_raw_bytes(&[rustc_version.len() as u8])?;
stream.emit_raw_bytes(rustc_version.as_bytes())
}
/// Reads the contents of a file with a file header as defined in this module.

View file

@ -2,7 +2,7 @@
use rustc_data_structures::sync::join;
use rustc_middle::dep_graph::{DepGraph, DepKind, WorkProduct, WorkProductId};
use rustc_middle::ty::TyCtxt;
use rustc_serialize::opaque::Encoder;
use rustc_serialize::opaque::{FileEncodeResult, FileEncoder};
use rustc_serialize::Encodable as RustcEncodable;
use rustc_session::Session;
use std::fs;
@ -33,12 +33,12 @@ pub fn save_dep_graph(tcx: TyCtxt<'_>) {
join(
move || {
sess.time("incr_comp_persist_result_cache", || {
save_in(sess, query_cache_path, |e| encode_query_cache(tcx, e));
save_in(sess, query_cache_path, "query cache", |e| encode_query_cache(tcx, e));
});
},
|| {
sess.time("incr_comp_persist_dep_graph", || {
save_in(sess, dep_graph_path, |e| {
save_in(sess, dep_graph_path, "dependency graph", |e| {
sess.time("incr_comp_encode_dep_graph", || encode_dep_graph(tcx, e))
});
});
@ -65,7 +65,7 @@ pub fn save_work_product_index(
debug!("save_work_product_index()");
dep_graph.assert_ignored();
let path = work_products_path(sess);
save_in(sess, path, |e| encode_work_product_index(&new_work_products, e));
save_in(sess, path, "work product index", |e| encode_work_product_index(&new_work_products, e));
// We also need to clean out old work-products, as not all of them are
// deleted during invalidation. Some object files don't change their
@ -92,13 +92,13 @@ pub fn save_work_product_index(
});
}
fn save_in<F>(sess: &Session, path_buf: PathBuf, encode: F)
fn save_in<F>(sess: &Session, path_buf: PathBuf, name: &str, encode: F)
where
F: FnOnce(&mut Encoder),
F: FnOnce(&mut FileEncoder) -> FileEncodeResult,
{
debug!("save: storing data in {}", path_buf.display());
// delete the old dep-graph, if any
// Delete the old file, if any.
// Note: It's important that we actually delete the old file and not just
// truncate and overwrite it, since it might be a shared hard-link, the
// underlying data of which we don't want to modify
@ -109,7 +109,8 @@ fn save_in<F>(sess: &Session, path_buf: PathBuf, encode: F)
Err(err) if err.kind() == io::ErrorKind::NotFound => (),
Err(err) => {
sess.err(&format!(
"unable to delete old dep-graph at `{}`: {}",
"unable to delete old {} at `{}`: {}",
name,
path_buf.display(),
err
));
@ -117,26 +118,35 @@ fn save_in<F>(sess: &Session, path_buf: PathBuf, encode: F)
}
}
// generate the data in a memory buffer
let mut encoder = Encoder::new(Vec::new());
file_format::write_file_header(&mut encoder, sess.is_nightly_build());
encode(&mut encoder);
// write the data out
let data = encoder.into_inner();
match fs::write(&path_buf, data) {
Ok(_) => {
debug!("save: data written to disk successfully");
}
let mut encoder = match FileEncoder::new(&path_buf) {
Ok(encoder) => encoder,
Err(err) => {
sess.err(&format!("failed to write dep-graph to `{}`: {}", path_buf.display(), err));
sess.err(&format!("failed to create {} at `{}`: {}", name, path_buf.display(), err));
return;
}
};
if let Err(err) = file_format::write_file_header(&mut encoder, sess.is_nightly_build()) {
sess.err(&format!("failed to write {} header to `{}`: {}", name, path_buf.display(), err));
return;
}
if let Err(err) = encode(&mut encoder) {
sess.err(&format!("failed to write {} to `{}`: {}", name, path_buf.display(), err));
return;
}
if let Err(err) = encoder.flush() {
sess.err(&format!("failed to flush {} to `{}`: {}", name, path_buf.display(), err));
return;
}
debug!("save: data written to disk successfully");
}
fn encode_dep_graph(tcx: TyCtxt<'_>, encoder: &mut Encoder) {
fn encode_dep_graph(tcx: TyCtxt<'_>, encoder: &mut FileEncoder) -> FileEncodeResult {
// First encode the commandline arguments hash
tcx.sess.opts.dep_tracking_hash().encode(encoder).unwrap();
tcx.sess.opts.dep_tracking_hash().encode(encoder)?;
// Encode the graph data.
let serialized_graph =
@ -214,15 +224,13 @@ struct Stat {
println!("[incremental]");
}
tcx.sess.time("incr_comp_encode_serialized_dep_graph", || {
serialized_graph.encode(encoder).unwrap();
});
tcx.sess.time("incr_comp_encode_serialized_dep_graph", || serialized_graph.encode(encoder))
}
fn encode_work_product_index(
work_products: &FxHashMap<WorkProductId, WorkProduct>,
encoder: &mut Encoder,
) {
encoder: &mut FileEncoder,
) -> FileEncodeResult {
let serialized_products: Vec<_> = work_products
.iter()
.map(|(id, work_product)| SerializedWorkProduct {
@ -231,11 +239,9 @@ fn encode_work_product_index(
})
.collect();
serialized_products.encode(encoder).unwrap();
serialized_products.encode(encoder)
}
fn encode_query_cache(tcx: TyCtxt<'_>, encoder: &mut Encoder) {
tcx.sess.time("incr_comp_serialize_result_cache", || {
tcx.serialize_query_result_cache(encoder).unwrap();
})
fn encode_query_cache(tcx: TyCtxt<'_>, encoder: &mut FileEncoder) -> FileEncodeResult {
tcx.sess.time("incr_comp_serialize_result_cache", || tcx.serialize_query_result_cache(encoder))
}

View file

@ -308,7 +308,7 @@ fn encode(&self, s: &mut EncodeContext<'a, 'tcx>) -> opaque::EncodeResult {
impl<'a, 'tcx> FingerprintEncoder for EncodeContext<'a, 'tcx> {
fn encode_fingerprint(&mut self, f: &Fingerprint) -> Result<(), Self::Error> {
f.encode_opaque(&mut self.opaque)
self.opaque.encode_fingerprint(f)
}
}

View file

@ -50,22 +50,6 @@ fn variant(&self) -> &Self::Variant {
}
}
pub trait OpaqueEncoder: Encoder {
fn opaque(&mut self) -> &mut rustc_serialize::opaque::Encoder;
fn encoder_position(&self) -> usize;
}
impl OpaqueEncoder for rustc_serialize::opaque::Encoder {
#[inline]
fn opaque(&mut self) -> &mut rustc_serialize::opaque::Encoder {
self
}
#[inline]
fn encoder_position(&self) -> usize {
self.position()
}
}
pub trait TyEncoder<'tcx>: Encoder {
const CLEAR_CROSS_CRATE: bool;

View file

@ -47,6 +47,7 @@
};
use rustc_index::vec::{Idx, IndexVec};
use rustc_macros::HashStable;
use rustc_serialize::opaque::{FileEncodeResult, FileEncoder};
use rustc_session::config::{BorrowckMode, CrateType, OutputFilenames};
use rustc_session::lint::{Level, Lint};
use rustc_session::Session;
@ -1336,10 +1337,7 @@ pub fn allocate_metadata_dep_nodes(self) {
}
}
pub fn serialize_query_result_cache<E>(self, encoder: &mut E) -> Result<(), E::Error>
where
E: ty::codec::OpaqueEncoder,
{
pub fn serialize_query_result_cache(self, encoder: &mut FileEncoder) -> FileEncodeResult {
self.queries.on_disk_cache.as_ref().map(|c| c.serialize(self, encoder)).unwrap_or(Ok(()))
}

View file

@ -1,7 +1,7 @@
use crate::dep_graph::{DepNode, DepNodeIndex, SerializedDepNodeIndex};
use crate::mir::interpret::{AllocDecodingSession, AllocDecodingState};
use crate::mir::{self, interpret};
use crate::ty::codec::{OpaqueEncoder, RefDecodable, TyDecoder, TyEncoder};
use crate::ty::codec::{RefDecodable, TyDecoder, TyEncoder};
use crate::ty::context::TyCtxt;
use crate::ty::{self, Ty};
use rustc_data_structures::fingerprint::{Fingerprint, FingerprintDecoder, FingerprintEncoder};
@ -14,7 +14,10 @@
use rustc_hir::definitions::DefPathHash;
use rustc_hir::definitions::Definitions;
use rustc_index::vec::{Idx, IndexVec};
use rustc_serialize::{opaque, Decodable, Decoder, Encodable, Encoder};
use rustc_serialize::{
opaque::{self, FileEncodeResult, FileEncoder},
Decodable, Decoder, Encodable, Encoder,
};
use rustc_session::{CrateDisambiguator, Session};
use rustc_span::hygiene::{
ExpnDataDecodeMode, ExpnDataEncodeMode, ExpnId, HygieneDecodeContext, HygieneEncodeContext,
@ -241,10 +244,11 @@ pub fn new_empty(source_map: &'sess SourceMap) -> Self {
}
}
pub fn serialize<'tcx, E>(&self, tcx: TyCtxt<'tcx>, encoder: &mut E) -> Result<(), E::Error>
where
E: OpaqueEncoder,
{
pub fn serialize<'tcx>(
&self,
tcx: TyCtxt<'tcx>,
encoder: &mut FileEncoder,
) -> FileEncodeResult {
// Serializing the `DepGraph` should not modify it.
tcx.dep_graph.with_ignore(|| {
// Allocate `SourceFileIndex`es.
@ -298,14 +302,14 @@ pub fn serialize<'tcx, E>(&self, tcx: TyCtxt<'tcx>, encoder: &mut E) -> Result<(
// Encode query results.
let mut query_result_index = EncodedQueryResultIndex::new();
tcx.sess.time("encode_query_results", || {
tcx.sess.time("encode_query_results", || -> FileEncodeResult {
let enc = &mut encoder;
let qri = &mut query_result_index;
macro_rules! encode_queries {
($($query:ident,)*) => {
$(
encode_query_results::<ty::query::queries::$query<'_>, _>(
encode_query_results::<ty::query::queries::$query<'_>>(
tcx,
enc,
qri
@ -324,15 +328,17 @@ macro_rules! encode_queries {
.current_diagnostics
.borrow()
.iter()
.map(|(dep_node_index, diagnostics)| {
let pos = AbsoluteBytePos::new(encoder.position());
// Let's make sure we get the expected type here.
let diagnostics: &EncodedDiagnostics = diagnostics;
let dep_node_index = SerializedDepNodeIndex::new(dep_node_index.index());
encoder.encode_tagged(dep_node_index, diagnostics)?;
.map(
|(dep_node_index, diagnostics)| -> Result<_, <FileEncoder as Encoder>::Error> {
let pos = AbsoluteBytePos::new(encoder.position());
// Let's make sure we get the expected type here.
let diagnostics: &EncodedDiagnostics = diagnostics;
let dep_node_index = SerializedDepNodeIndex::new(dep_node_index.index());
encoder.encode_tagged(dep_node_index, diagnostics)?;
Ok((dep_node_index, pos))
})
Ok((dep_node_index, pos))
},
)
.collect::<Result<_, _>>()?;
let interpret_alloc_index = {
@ -375,13 +381,13 @@ macro_rules! encode_queries {
hygiene_encode_context.encode(
&mut encoder,
|encoder, index, ctxt_data| {
|encoder, index, ctxt_data| -> FileEncodeResult {
let pos = AbsoluteBytePos::new(encoder.position());
encoder.encode_tagged(TAG_SYNTAX_CONTEXT, ctxt_data)?;
syntax_contexts.insert(index, pos);
Ok(())
},
|encoder, index, expn_data| {
|encoder, index, expn_data| -> FileEncodeResult {
let pos = AbsoluteBytePos::new(encoder.position());
encoder.encode_tagged(TAG_EXPN_DATA, expn_data)?;
expn_ids.insert(index, pos);
@ -410,7 +416,7 @@ macro_rules! encode_queries {
// Encode the position of the footer as the last 8 bytes of the
// file so we know where to look for it.
IntEncodedWithFixedSize(footer_pos).encode(encoder.encoder.opaque())?;
IntEncodedWithFixedSize(footer_pos).encode(encoder.encoder)?;
// DO NOT WRITE ANYTHING TO THE ENCODER AFTER THIS POINT! The address
// of the footer must be the last thing in the data stream.
@ -964,6 +970,17 @@ fn decode(d: &mut CacheDecoder<'a, 'tcx>) -> Result<Self, String> {
//- ENCODING -------------------------------------------------------------------
trait OpaqueEncoder: Encoder {
fn position(&self) -> usize;
}
impl OpaqueEncoder for FileEncoder {
#[inline]
fn position(&self) -> usize {
FileEncoder::position(self)
}
}
/// An encoder that can write to the incremental compilation cache.
struct CacheEncoder<'a, 'tcx, E: OpaqueEncoder> {
tcx: TyCtxt<'tcx>,
@ -1005,9 +1022,9 @@ fn encode_tagged<T: Encodable<Self>, V: Encodable<Self>>(
}
}
impl<'a, 'tcx> FingerprintEncoder for CacheEncoder<'a, 'tcx, rustc_serialize::opaque::Encoder> {
fn encode_fingerprint(&mut self, f: &Fingerprint) -> opaque::EncodeResult {
f.encode_opaque(self.encoder)
impl<'a, 'tcx, E: OpaqueEncoder> FingerprintEncoder for CacheEncoder<'a, 'tcx, E> {
fn encode_fingerprint(&mut self, f: &Fingerprint) -> Result<(), E::Error> {
self.encoder.encode_fingerprint(f)
}
}
@ -1073,7 +1090,7 @@ impl<'a, 'tcx, E> TyEncoder<'tcx> for CacheEncoder<'a, 'tcx, E>
const CLEAR_CROSS_CRATE: bool = false;
fn position(&self) -> usize {
self.encoder.encoder_position()
self.encoder.position()
}
fn type_shorthands(&mut self) -> &mut FxHashMap<Ty<'tcx>, usize> {
&mut self.type_shorthands
@ -1159,12 +1176,12 @@ fn emit_unit(&mut self) -> Result<(), Self::Error> {
}
}
// This ensures that the `Encodable<opaque::Encoder>::encode` specialization for byte slices
// is used when a `CacheEncoder` having an `opaque::Encoder` is passed to `Encodable::encode`.
// This ensures that the `Encodable<opaque::FileEncoder>::encode` specialization for byte slices
// is used when a `CacheEncoder` having an `opaque::FileEncoder` is passed to `Encodable::encode`.
// Unfortunately, we have to manually opt into specializations this way, given how `CacheEncoder`
// and the encoding traits currently work.
impl<'a, 'tcx> Encodable<CacheEncoder<'a, 'tcx, opaque::Encoder>> for [u8] {
fn encode(&self, e: &mut CacheEncoder<'a, 'tcx, opaque::Encoder>) -> opaque::EncodeResult {
impl<'a, 'tcx> Encodable<CacheEncoder<'a, 'tcx, FileEncoder>> for [u8] {
fn encode(&self, e: &mut CacheEncoder<'a, 'tcx, FileEncoder>) -> FileEncodeResult {
self.encode(e.encoder)
}
}
@ -1176,8 +1193,8 @@ impl IntEncodedWithFixedSize {
pub const ENCODED_SIZE: usize = 8;
}
impl Encodable<opaque::Encoder> for IntEncodedWithFixedSize {
fn encode(&self, e: &mut opaque::Encoder) -> Result<(), !> {
impl<E: OpaqueEncoder> Encodable<E> for IntEncodedWithFixedSize {
fn encode(&self, e: &mut E) -> Result<(), E::Error> {
let start_pos = e.position();
for i in 0..IntEncodedWithFixedSize::ENCODED_SIZE {
((self.0 >> (i * 8)) as u8).encode(e)?;
@ -1205,15 +1222,14 @@ fn decode(decoder: &mut opaque::Decoder<'a>) -> Result<IntEncodedWithFixedSize,
}
}
fn encode_query_results<'a, 'tcx, Q, E>(
fn encode_query_results<'a, 'tcx, Q>(
tcx: TyCtxt<'tcx>,
encoder: &mut CacheEncoder<'a, 'tcx, E>,
encoder: &mut CacheEncoder<'a, 'tcx, FileEncoder>,
query_result_index: &mut EncodedQueryResultIndex,
) -> Result<(), E::Error>
) -> FileEncodeResult
where
Q: super::QueryDescription<TyCtxt<'tcx>> + super::QueryAccessors<TyCtxt<'tcx>>,
Q::Value: Encodable<CacheEncoder<'a, 'tcx, E>>,
E: 'a + OpaqueEncoder,
Q::Value: Encodable<CacheEncoder<'a, 'tcx, FileEncoder>>,
{
let _timer = tcx
.sess
@ -1230,7 +1246,7 @@ fn encode_query_results<'a, 'tcx, Q, E>(
// Record position of the cache entry.
query_result_index
.push((dep_node, AbsoluteBytePos::new(encoder.encoder.opaque().position())));
.push((dep_node, AbsoluteBytePos::new(encoder.encoder.position())));
// Encode the type check tables with the `SerializedDepNodeIndex`
// as tag.

View file

@ -1,16 +1,45 @@
#![macro_use]
macro_rules! max_leb128_len {
($int_ty:ty) => {
// The longest LEB128 encoding for an integer uses 7 bits per byte.
(std::mem::size_of::<$int_ty>() * 8 + 6) / 7
};
}
// Returns the longest LEB128 encoding of all supported integer types.
pub const fn max_leb128_len() -> usize {
max_leb128_len!(u128)
}
macro_rules! impl_write_unsigned_leb128 {
($fn_name:ident, $int_ty:ident) => {
($fn_name:ident, $int_ty:ty) => {
#[inline]
pub fn $fn_name(out: &mut Vec<u8>, mut value: $int_ty) {
pub fn $fn_name(
out: &mut [::std::mem::MaybeUninit<u8>; max_leb128_len!($int_ty)],
mut value: $int_ty,
) -> &[u8] {
let mut i = 0;
loop {
if value < 0x80 {
out.push(value as u8);
unsafe {
*out.get_unchecked_mut(i).as_mut_ptr() = value as u8;
}
i += 1;
break;
} else {
out.push(((value & 0x7f) | 0x80) as u8);
unsafe {
*out.get_unchecked_mut(i).as_mut_ptr() = ((value & 0x7f) | 0x80) as u8;
}
value >>= 7;
i += 1;
}
}
unsafe { ::std::mem::MaybeUninit::slice_assume_init_ref(&out.get_unchecked(..i)) }
}
};
}
@ -22,7 +51,7 @@ pub fn $fn_name(out: &mut Vec<u8>, mut value: $int_ty) {
impl_write_unsigned_leb128!(write_usize_leb128, usize);
macro_rules! impl_read_unsigned_leb128 {
($fn_name:ident, $int_ty:ident) => {
($fn_name:ident, $int_ty:ty) => {
#[inline]
pub fn $fn_name(slice: &[u8]) -> ($int_ty, usize) {
let mut result = 0;
@ -49,39 +78,46 @@ pub fn $fn_name(slice: &[u8]) -> ($int_ty, usize) {
impl_read_unsigned_leb128!(read_u128_leb128, u128);
impl_read_unsigned_leb128!(read_usize_leb128, usize);
#[inline]
/// encodes an integer using signed leb128 encoding and stores
/// the result using a callback function.
///
/// The callback `write` is called once for each position
/// that is to be written to with the byte to be encoded
/// at that position.
pub fn write_signed_leb128_to<W>(mut value: i128, mut write: W)
where
W: FnMut(u8),
{
loop {
let mut byte = (value as u8) & 0x7f;
value >>= 7;
let more =
!(((value == 0) && ((byte & 0x40) == 0)) || ((value == -1) && ((byte & 0x40) != 0)));
macro_rules! impl_write_signed_leb128 {
($fn_name:ident, $int_ty:ty) => {
#[inline]
pub fn $fn_name(
out: &mut [::std::mem::MaybeUninit<u8>; max_leb128_len!($int_ty)],
mut value: $int_ty,
) -> &[u8] {
let mut i = 0;
if more {
byte |= 0x80; // Mark this byte to show that more bytes will follow.
loop {
let mut byte = (value as u8) & 0x7f;
value >>= 7;
let more = !(((value == 0) && ((byte & 0x40) == 0))
|| ((value == -1) && ((byte & 0x40) != 0)));
if more {
byte |= 0x80; // Mark this byte to show that more bytes will follow.
}
unsafe {
*out.get_unchecked_mut(i).as_mut_ptr() = byte;
}
i += 1;
if !more {
break;
}
}
unsafe { ::std::mem::MaybeUninit::slice_assume_init_ref(&out.get_unchecked(..i)) }
}
write(byte);
if !more {
break;
}
}
};
}
#[inline]
pub fn write_signed_leb128(out: &mut Vec<u8>, value: i128) {
write_signed_leb128_to(value, |v| out.push(v))
}
impl_write_signed_leb128!(write_i16_leb128, i16);
impl_write_signed_leb128!(write_i32_leb128, i32);
impl_write_signed_leb128!(write_i64_leb128, i64);
impl_write_signed_leb128!(write_i128_leb128, i128);
impl_write_signed_leb128!(write_isize_leb128, isize);
#[inline]
pub fn read_signed_leb128(data: &[u8], start_position: usize) -> (i128, usize) {

View file

@ -16,6 +16,9 @@
#![cfg_attr(bootstrap, feature(min_const_generics))]
#![feature(min_specialization)]
#![feature(vec_spare_capacity)]
#![feature(core_intrinsics)]
#![feature(maybe_uninit_slice)]
#![feature(new_uninit)]
#![cfg_attr(test, feature(test))]
#![allow(rustc::internal)]

View file

@ -1,7 +1,10 @@
use crate::leb128::{self, read_signed_leb128, write_signed_leb128};
use crate::leb128::{self, max_leb128_len, read_signed_leb128};
use crate::serialize;
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Write};
use std::mem::MaybeUninit;
use std::path::Path;
use std::ptr;
// -----------------------------------------------------------------------------
@ -23,22 +26,35 @@ pub fn into_inner(self) -> Vec<u8> {
self.data
}
#[inline]
pub fn position(&self) -> usize {
self.data.len()
}
#[inline]
pub fn emit_raw_bytes(&mut self, s: &[u8]) {
self.data.extend_from_slice(s);
}
}
macro_rules! write_uleb128 {
($enc:expr, $value:expr, $fun:ident) => {{
leb128::$fun(&mut $enc.data, $value);
Ok(())
}};
}
macro_rules! write_leb128 {
($enc:expr, $value:expr, $int_ty:ty, $fun:ident) => {{
const MAX_ENCODED_LEN: usize = max_leb128_len!($int_ty);
let old_len = $enc.data.len();
if MAX_ENCODED_LEN > $enc.data.capacity() - old_len {
$enc.data.reserve(MAX_ENCODED_LEN);
}
// SAFETY: The above check and `reserve` ensures that there is enough
// room to write the encoded value to the vector's internal buffer.
unsafe {
let buf = &mut *($enc.data.as_mut_ptr().add(old_len)
as *mut [MaybeUninit<u8>; MAX_ENCODED_LEN]);
let encoded = leb128::$fun(buf, $value);
$enc.data.set_len(old_len + encoded.len());
}
macro_rules! write_sleb128 {
($enc:expr, $value:expr) => {{
write_signed_leb128(&mut $enc.data, $value as i128);
Ok(())
}};
}
@ -53,27 +69,27 @@ fn emit_unit(&mut self) -> EncodeResult {
#[inline]
fn emit_usize(&mut self, v: usize) -> EncodeResult {
write_uleb128!(self, v, write_usize_leb128)
write_leb128!(self, v, usize, write_usize_leb128)
}
#[inline]
fn emit_u128(&mut self, v: u128) -> EncodeResult {
write_uleb128!(self, v, write_u128_leb128)
write_leb128!(self, v, u128, write_u128_leb128)
}
#[inline]
fn emit_u64(&mut self, v: u64) -> EncodeResult {
write_uleb128!(self, v, write_u64_leb128)
write_leb128!(self, v, u64, write_u64_leb128)
}
#[inline]
fn emit_u32(&mut self, v: u32) -> EncodeResult {
write_uleb128!(self, v, write_u32_leb128)
write_leb128!(self, v, u32, write_u32_leb128)
}
#[inline]
fn emit_u16(&mut self, v: u16) -> EncodeResult {
write_uleb128!(self, v, write_u16_leb128)
write_leb128!(self, v, u16, write_u16_leb128)
}
#[inline]
@ -84,27 +100,27 @@ fn emit_u8(&mut self, v: u8) -> EncodeResult {
#[inline]
fn emit_isize(&mut self, v: isize) -> EncodeResult {
write_sleb128!(self, v)
write_leb128!(self, v, isize, write_isize_leb128)
}
#[inline]
fn emit_i128(&mut self, v: i128) -> EncodeResult {
write_sleb128!(self, v)
write_leb128!(self, v, i128, write_i128_leb128)
}
#[inline]
fn emit_i64(&mut self, v: i64) -> EncodeResult {
write_sleb128!(self, v)
write_leb128!(self, v, i64, write_i64_leb128)
}
#[inline]
fn emit_i32(&mut self, v: i32) -> EncodeResult {
write_sleb128!(self, v)
write_leb128!(self, v, i32, write_i32_leb128)
}
#[inline]
fn emit_i16(&mut self, v: i16) -> EncodeResult {
write_sleb128!(self, v)
write_leb128!(self, v, i16, write_i16_leb128)
}
#[inline]
@ -143,10 +159,354 @@ fn emit_str(&mut self, v: &str) -> EncodeResult {
}
}
impl Encoder {
pub type FileEncodeResult = Result<(), io::Error>;
// `FileEncoder` encodes data to file via fixed-size buffer.
//
// When encoding large amounts of data to a file, using `FileEncoder` may be
// preferred over using `Encoder` to encode to a `Vec`, and then writing the
// `Vec` to file, as the latter uses as much memory as there is encoded data,
// while the former uses the fixed amount of memory allocated to the buffer.
// `FileEncoder` also has the advantage of not needing to reallocate as data
// is appended to it, but the disadvantage of requiring more error handling,
// which has some runtime overhead.
pub struct FileEncoder {
// The input buffer. For adequate performance, we need more control over
// buffering than `BufWriter` offers. If `BufWriter` ever offers a raw
// buffer access API, we can use it, and remove `buf` and `buffered`.
buf: Box<[MaybeUninit<u8>]>,
buffered: usize,
flushed: usize,
file: File,
}
impl FileEncoder {
pub fn new<P: AsRef<Path>>(path: P) -> io::Result<Self> {
const DEFAULT_BUF_SIZE: usize = 8192;
FileEncoder::with_capacity(path, DEFAULT_BUF_SIZE)
}
pub fn with_capacity<P: AsRef<Path>>(path: P, capacity: usize) -> io::Result<Self> {
// Require capacity at least as large as the largest LEB128 encoding
// here, so that we don't have to check or handle this on every write.
assert!(capacity >= max_leb128_len());
// Require capacity small enough such that some capacity checks can be
// done using guaranteed non-overflowing add rather than sub, which
// shaves an instruction off those code paths (on x86 at least).
assert!(capacity <= usize::MAX - max_leb128_len());
let file = File::create(path)?;
Ok(FileEncoder { buf: Box::new_uninit_slice(capacity), buffered: 0, flushed: 0, file })
}
#[inline]
pub fn position(&self) -> usize {
self.data.len()
// Tracking position this way instead of having a `self.position` field
// means that we don't have to update the position on every write call.
self.flushed + self.buffered
}
#[inline]
pub fn emit_raw_bytes(&mut self, s: &[u8]) -> FileEncodeResult {
self.write_all(s)
}
pub fn flush(&mut self) -> FileEncodeResult {
// This is basically a copy of `BufWriter::flush`. If `BufWriter` ever
// offers a raw buffer access API, we can use it, and remove this.
/// Helper struct to ensure the buffer is updated after all the writes
/// are complete. It tracks the number of written bytes and drains them
/// all from the front of the buffer when dropped.
struct BufGuard<'a> {
buffer: &'a mut [u8],
encoder_buffered: &'a mut usize,
encoder_flushed: &'a mut usize,
flushed: usize,
}
impl<'a> BufGuard<'a> {
fn new(
buffer: &'a mut [u8],
encoder_buffered: &'a mut usize,
encoder_flushed: &'a mut usize,
) -> Self {
assert_eq!(buffer.len(), *encoder_buffered);
Self { buffer, encoder_buffered, encoder_flushed, flushed: 0 }
}
/// The unwritten part of the buffer
fn remaining(&self) -> &[u8] {
&self.buffer[self.flushed..]
}
/// Flag some bytes as removed from the front of the buffer
fn consume(&mut self, amt: usize) {
self.flushed += amt;
}
/// true if all of the bytes have been written
fn done(&self) -> bool {
self.flushed >= *self.encoder_buffered
}
}
impl Drop for BufGuard<'_> {
fn drop(&mut self) {
if self.flushed > 0 {
if self.done() {
*self.encoder_flushed += *self.encoder_buffered;
*self.encoder_buffered = 0;
} else {
self.buffer.copy_within(self.flushed.., 0);
*self.encoder_flushed += self.flushed;
*self.encoder_buffered -= self.flushed;
}
}
}
}
let mut guard = BufGuard::new(
unsafe { MaybeUninit::slice_assume_init_mut(&mut self.buf[..self.buffered]) },
&mut self.buffered,
&mut self.flushed,
);
while !guard.done() {
match self.file.write(guard.remaining()) {
Ok(0) => {
return Err(io::Error::new(
io::ErrorKind::WriteZero,
"failed to write the buffered data",
));
}
Ok(n) => guard.consume(n),
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
#[inline]
fn capacity(&self) -> usize {
self.buf.len()
}
#[inline]
fn write_one(&mut self, value: u8) -> FileEncodeResult {
// We ensure this during `FileEncoder` construction.
debug_assert!(self.capacity() >= 1);
let mut buffered = self.buffered;
if std::intrinsics::unlikely(buffered >= self.capacity()) {
self.flush()?;
buffered = 0;
}
// SAFETY: The above check and `flush` ensures that there is enough
// room to write the input to the buffer.
unsafe {
*MaybeUninit::slice_as_mut_ptr(&mut self.buf).add(buffered) = value;
}
self.buffered = buffered + 1;
Ok(())
}
#[inline]
fn write_all(&mut self, buf: &[u8]) -> FileEncodeResult {
let capacity = self.capacity();
let buf_len = buf.len();
if std::intrinsics::likely(buf_len <= capacity) {
let mut buffered = self.buffered;
if std::intrinsics::unlikely(buf_len > capacity - buffered) {
self.flush()?;
buffered = 0;
}
// SAFETY: The above check and `flush` ensures that there is enough
// room to write the input to the buffer.
unsafe {
let src = buf.as_ptr();
let dst = MaybeUninit::slice_as_mut_ptr(&mut self.buf).add(buffered);
ptr::copy_nonoverlapping(src, dst, buf_len);
}
self.buffered = buffered + buf_len;
Ok(())
} else {
self.write_all_unbuffered(buf)
}
}
fn write_all_unbuffered(&mut self, mut buf: &[u8]) -> FileEncodeResult {
if self.buffered > 0 {
self.flush()?;
}
// This is basically a copy of `Write::write_all` but also updates our
// `self.flushed`. It's necessary because `Write::write_all` does not
// return the number of bytes written when an error is encountered, and
// without that, we cannot accurately update `self.flushed` on error.
while !buf.is_empty() {
match self.file.write(buf) {
Ok(0) => {
return Err(io::Error::new(
io::ErrorKind::WriteZero,
"failed to write whole buffer",
));
}
Ok(n) => {
buf = &buf[n..];
self.flushed += n;
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
}
impl Drop for FileEncoder {
fn drop(&mut self) {
let _result = self.flush();
}
}
macro_rules! file_encoder_write_leb128 {
($enc:expr, $value:expr, $int_ty:ty, $fun:ident) => {{
const MAX_ENCODED_LEN: usize = max_leb128_len!($int_ty);
// We ensure this during `FileEncoder` construction.
debug_assert!($enc.capacity() >= MAX_ENCODED_LEN);
let mut buffered = $enc.buffered;
// This can't overflow. See assertion in `FileEncoder::with_capacity`.
if std::intrinsics::unlikely(buffered + MAX_ENCODED_LEN > $enc.capacity()) {
$enc.flush()?;
buffered = 0;
}
// SAFETY: The above check and flush ensures that there is enough
// room to write the encoded value to the buffer.
let buf = unsafe {
&mut *($enc.buf.as_mut_ptr().add(buffered) as *mut [MaybeUninit<u8>; MAX_ENCODED_LEN])
};
let encoded = leb128::$fun(buf, $value);
$enc.buffered = buffered + encoded.len();
Ok(())
}};
}
impl serialize::Encoder for FileEncoder {
type Error = io::Error;
#[inline]
fn emit_unit(&mut self) -> FileEncodeResult {
Ok(())
}
#[inline]
fn emit_usize(&mut self, v: usize) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, usize, write_usize_leb128)
}
#[inline]
fn emit_u128(&mut self, v: u128) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, u128, write_u128_leb128)
}
#[inline]
fn emit_u64(&mut self, v: u64) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, u64, write_u64_leb128)
}
#[inline]
fn emit_u32(&mut self, v: u32) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, u32, write_u32_leb128)
}
#[inline]
fn emit_u16(&mut self, v: u16) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, u16, write_u16_leb128)
}
#[inline]
fn emit_u8(&mut self, v: u8) -> FileEncodeResult {
self.write_one(v)
}
#[inline]
fn emit_isize(&mut self, v: isize) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, isize, write_isize_leb128)
}
#[inline]
fn emit_i128(&mut self, v: i128) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, i128, write_i128_leb128)
}
#[inline]
fn emit_i64(&mut self, v: i64) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, i64, write_i64_leb128)
}
#[inline]
fn emit_i32(&mut self, v: i32) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, i32, write_i32_leb128)
}
#[inline]
fn emit_i16(&mut self, v: i16) -> FileEncodeResult {
file_encoder_write_leb128!(self, v, i16, write_i16_leb128)
}
#[inline]
fn emit_i8(&mut self, v: i8) -> FileEncodeResult {
let as_u8: u8 = unsafe { std::mem::transmute(v) };
self.emit_u8(as_u8)
}
#[inline]
fn emit_bool(&mut self, v: bool) -> FileEncodeResult {
self.emit_u8(if v { 1 } else { 0 })
}
#[inline]
fn emit_f64(&mut self, v: f64) -> FileEncodeResult {
let as_u64: u64 = v.to_bits();
self.emit_u64(as_u64)
}
#[inline]
fn emit_f32(&mut self, v: f32) -> FileEncodeResult {
let as_u32: u32 = v.to_bits();
self.emit_u32(as_u32)
}
#[inline]
fn emit_char(&mut self, v: char) -> FileEncodeResult {
self.emit_u32(v as u32)
}
#[inline]
fn emit_str(&mut self, v: &str) -> FileEncodeResult {
self.emit_usize(v.len())?;
self.emit_raw_bytes(v.as_bytes())
}
}
@ -342,6 +702,13 @@ fn encode(&self, e: &mut Encoder) -> EncodeResult {
}
}
impl serialize::Encodable<FileEncoder> for [u8] {
fn encode(&self, e: &mut FileEncoder) -> FileEncodeResult {
serialize::Encoder::emit_usize(e, self.len())?;
e.emit_raw_bytes(self)
}
}
// Specialize decoding `Vec<u8>`. This specialization also applies to decoding `Box<[u8]>`s, etc.,
// since the default implementations call `decode` to produce a `Vec<u8>` internally.
impl<'a> serialize::Decodable<Decoder<'a>> for Vec<u8> {

View file

@ -1,4 +1,8 @@
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
use rustc_serialize::leb128::*;
use std::mem::MaybeUninit;
macro_rules! impl_test_unsigned_leb128 {
($test_name:ident, $write_fn_name:ident, $read_fn_name:ident, $int_ty:ident) => {
@ -7,7 +11,8 @@ fn $test_name() {
let mut stream = Vec::new();
for x in 0..62 {
$write_fn_name(&mut stream, (3u64 << x) as $int_ty);
let mut buf = MaybeUninit::uninit_array();
stream.extend($write_fn_name(&mut buf, (3u64 << x) as $int_ty));
}
let mut position = 0;
@ -28,18 +33,34 @@ fn $test_name() {
impl_test_unsigned_leb128!(test_u128_leb128, write_u128_leb128, read_u128_leb128, u128);
impl_test_unsigned_leb128!(test_usize_leb128, write_usize_leb128, read_usize_leb128, usize);
#[test]
fn test_signed_leb128() {
let values: Vec<_> = (-500..500).map(|i| i * 0x12345789ABCDEF).collect();
let mut stream = Vec::new();
for &x in &values {
write_signed_leb128(&mut stream, x);
}
let mut pos = 0;
for &x in &values {
let (value, bytes_read) = read_signed_leb128(&mut stream, pos);
pos += bytes_read;
assert_eq!(x, value);
}
assert_eq!(pos, stream.len());
macro_rules! impl_test_signed_leb128 {
($test_name:ident, $write_fn_name:ident, $int_ty:ident) => {
#[test]
fn $test_name() {
let values: Vec<_> = (-500..500)
.map(|i| ((i as $int_ty).wrapping_mul(0x12345789ABCDEF_i64 as $int_ty)))
.collect();
let mut stream = Vec::new();
for &x in &values {
let mut buf = MaybeUninit::uninit_array();
stream.extend($write_fn_name(&mut buf, x));
}
let mut position = 0;
for &x in &values {
let expected = x as i128;
let (actual, bytes_read) = read_signed_leb128(&mut stream, position);
assert_eq!(expected, actual);
position += bytes_read;
}
assert_eq!(stream.len(), position);
}
};
}
impl_test_signed_leb128!(test_i16_leb128, write_i16_leb128, i16);
impl_test_signed_leb128!(test_i32_leb128, write_i32_leb128, i32);
impl_test_signed_leb128!(test_i64_leb128, write_i64_leb128, i64);
impl_test_signed_leb128!(test_i128_leb128, write_i128_leb128, i128);
impl_test_signed_leb128!(test_isize_leb128, write_isize_leb128, isize);