Add tool src/tools/coverage-dump for use by some new coverage tests

This commit is contained in:
Zalathar 2023-08-11 11:52:37 +10:00
parent 04374cd742
commit 1367104cb2
11 changed files with 562 additions and 1 deletions

View File

@ -722,6 +722,18 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "coverage-dump"
version = "0.1.0"
dependencies = [
"anyhow",
"leb128",
"md-5",
"miniz_oxide",
"regex",
"rustc-demangle",
]
[[package]]
name = "coverage_test_macros"
version = "0.0.0"
@ -2041,6 +2053,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "leb128"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67"
[[package]]
name = "levenshtein"
version = "1.0.5"

View File

@ -43,6 +43,7 @@ members = [
"src/tools/generate-windows-sys",
"src/tools/rustdoc-gui-test",
"src/tools/opt-dist",
"src/tools/coverage-dump",
]
exclude = [

View File

@ -703,7 +703,8 @@ macro_rules! describe {
llvm::Lld,
llvm::CrtBeginEnd,
tool::RustdocGUITest,
tool::OptimizedDist
tool::OptimizedDist,
tool::CoverageDump,
),
Kind::Check | Kind::Clippy | Kind::Fix => describe!(
check::Std,

View File

@ -306,6 +306,7 @@ fn run(self, builder: &Builder<'_>) -> PathBuf {
GenerateWindowsSys, "src/tools/generate-windows-sys", "generate-windows-sys";
RustdocGUITest, "src/tools/rustdoc-gui-test", "rustdoc-gui-test", is_unstable_tool = true, allow_features = "test";
OptimizedDist, "src/tools/opt-dist", "opt-dist";
CoverageDump, "src/tools/coverage-dump", "coverage-dump";
);
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]

View File

@ -0,0 +1,14 @@
[package]
name = "coverage-dump"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.71"
leb128 = "0.2.5"
md5 = { package = "md-5" , version = "0.10.5" }
miniz_oxide = "0.7.1"
regex = "1.8.4"
rustc-demangle = "0.1.23"

View File

@ -0,0 +1,8 @@
This tool extracts coverage mapping information from an LLVM IR assembly file
(`.ll`), and prints it in a more human-readable form that can be used for
snapshot tests.
The output format is mostly arbitrary, so it's OK to change the output as long
as any affected tests are also re-blessed. However, the output should be
consistent across different executions on different platforms, so avoid
printing any information that is platform-specific or non-deterministic.

View File

@ -0,0 +1,296 @@
use crate::parser::{unescape_llvm_string_contents, Parser};
use anyhow::{anyhow, Context};
use regex::Regex;
use std::collections::HashMap;
use std::fmt::{self, Debug, Write as _};
use std::sync::OnceLock;
pub(crate) fn dump_covfun_mappings(
llvm_ir: &str,
function_names: &HashMap<u64, String>,
) -> anyhow::Result<()> {
// Extract function coverage entries from the LLVM IR assembly, and associate
// each entry with its (demangled) name.
let mut covfun_entries = llvm_ir
.lines()
.filter_map(covfun_line_data)
.map(|line_data| (function_names.get(&line_data.name_hash).map(String::as_str), line_data))
.collect::<Vec<_>>();
covfun_entries.sort_by(|a, b| {
// Sort entries primarily by name, to help make the order consistent
// across platforms and relatively insensitive to changes.
// (Sadly we can't use `sort_by_key` because we would need to return references.)
Ord::cmp(&a.0, &b.0)
.then_with(|| Ord::cmp(&a.1.is_used, &b.1.is_used))
.then_with(|| Ord::cmp(a.1.payload.as_slice(), b.1.payload.as_slice()))
});
for (name, line_data) in &covfun_entries {
let name = name.unwrap_or("(unknown)");
let unused = if line_data.is_used { "" } else { " (unused)" };
println!("Function name: {name}{unused}");
let payload: &[u8] = &line_data.payload;
println!("Raw bytes ({len}): 0x{payload:02x?}", len = payload.len());
let mut parser = Parser::new(payload);
let num_files = parser.read_uleb128_u32()?;
println!("Number of files: {num_files}");
for i in 0..num_files {
let global_file_id = parser.read_uleb128_u32()?;
println!("- file {i} => global file {global_file_id}");
}
let num_expressions = parser.read_uleb128_u32()?;
println!("Number of expressions: {num_expressions}");
let mut expression_resolver = ExpressionResolver::new();
for i in 0..num_expressions {
let lhs = parser.read_simple_term()?;
let rhs = parser.read_simple_term()?;
println!("- expression {i} operands: lhs = {lhs:?}, rhs = {rhs:?}");
expression_resolver.push_operands(lhs, rhs);
}
for i in 0..num_files {
let num_mappings = parser.read_uleb128_u32()?;
println!("Number of file {i} mappings: {num_mappings}");
for _ in 0..num_mappings {
let (kind, region) = parser.read_mapping_kind_and_region()?;
println!("- {kind:?} at {region:?}");
match kind {
// Also print expression mappings in resolved form.
MappingKind::Code(term @ CovTerm::Expression { .. })
| MappingKind::Gap(term @ CovTerm::Expression { .. }) => {
println!(" = {}", expression_resolver.format_term(term));
}
// If the mapping is a branch region, print both of its arms
// in resolved form (even if they aren't expressions).
MappingKind::Branch { r#true, r#false } => {
println!(" true = {}", expression_resolver.format_term(r#true));
println!(" false = {}", expression_resolver.format_term(r#false));
}
_ => (),
}
}
}
parser.ensure_empty()?;
println!();
}
Ok(())
}
/// Data extracted from one `@__covrec_*` line of LLVM IR assembly
/// (see `covfun_line_data`).
struct CovfunLineData {
/// Name hash, parsed as hexadecimal from the record's variable name.
name_hash: u64,
/// Whether the variable name carried the optional `u` suffix.
is_used: bool,
/// Unescaped bytes of the record's string-literal payload.
payload: Vec<u8>,
}
/// Inspects one line of LLVM IR assembly and, if it defines a
/// `@__covrec_*` function coverage record, returns that record's name hash,
/// used flag and unescaped payload. Returns `None` for any other line.
fn covfun_line_data(line: &str) -> Option<CovfunLineData> {
    // We cheat a little bit and match variable names `@__covrec_[HASH]u`
    // rather than the section name, because the section name is harder to
    // extract and differs across Linux/Windows/macOS. The symbol name hash is
    // also taken from the variable name rather than from the data, since
    // that is easier and both should match.
    static COVREC_RE: OnceLock<Regex> = OnceLock::new();
    let covrec_re = COVREC_RE.get_or_init(|| {
        Regex::new(
            r#"^@__covrec_(?<name_hash>[0-9A-Z]+)(?<is_used>u)? = .*\[[0-9]+ x i8\] c"(?<payload>[^"]*)".*$"#,
        )
        .unwrap()
    });

    let caps = covrec_re.captures(line)?;
    Some(CovfunLineData {
        name_hash: u64::from_str_radix(&caps["name_hash"], 16).unwrap(),
        is_used: caps.name("is_used").is_some(),
        payload: unescape_llvm_string_contents(&caps["payload"]),
    })
}
// Extra parser methods only needed when parsing `covfun` payloads.
impl<'a> Parser<'a> {
    /// Reads one ULEB128 value and decodes it as a term, failing if the
    /// value is not a valid term encoding.
    fn read_simple_term(&mut self) -> anyhow::Result<CovTerm> {
        let encoded = self.read_uleb128_u32()?;
        CovTerm::decode(encoded).context("decoding term")
    }

    /// Reads a mapping kind followed by its region, turning a code mapping
    /// into a gap mapping when the region's end column has its high bit set.
    fn read_mapping_kind_and_region(&mut self) -> anyhow::Result<(MappingKind, MappingRegion)> {
        let kind = self.read_raw_mapping_kind()?;
        let mut region = self.read_raw_mapping_region()?;

        const GAP_FLAG: u32 = 1u32 << 31;
        if region.end_column & GAP_FLAG == 0 {
            return Ok((kind, region));
        }

        // The flag is not part of the column number, so strip it.
        region.end_column &= !GAP_FLAG;
        match kind {
            MappingKind::Code(term) => Ok((MappingKind::Gap(term), region)),
            // LLVM's coverage mapping reader will actually handle this
            // case without complaint, but the result is almost certainly
            // a meaningless implementation artifact.
            kind => Err(anyhow!("unexpected base kind for gap region: {kind:?}")),
        }
    }

    /// Reads a mapping kind (and, for branch mappings, its two arms) without
    /// applying the gap-region adjustment.
    fn read_raw_mapping_kind(&mut self) -> anyhow::Result<MappingKind> {
        let raw_mapping_kind = self.read_uleb128_u32()?;

        // Anything that decodes as a term is an ordinary code mapping.
        if let Some(term) = CovTerm::decode(raw_mapping_kind) {
            return Ok(MappingKind::Code(term));
        }

        // Decoding only fails for a zero tag with non-zero high bits.
        assert_eq!(raw_mapping_kind & 0b11, 0);
        assert_ne!(raw_mapping_kind, 0);

        let high = raw_mapping_kind >> 3;
        if raw_mapping_kind & 0b100 != 0 {
            return Ok(MappingKind::Expansion(high));
        }

        match high {
            0 => unreachable!("zero kind should have already been handled as a code mapping"),
            2 => Ok(MappingKind::Skip),
            4 => {
                let r#true = self.read_simple_term()?;
                let r#false = self.read_simple_term()?;
                Ok(MappingKind::Branch { r#true, r#false })
            }
            _ => Err(anyhow!("unknown mapping kind: {raw_mapping_kind:#x}")),
        }
    }

    /// Reads the four ULEB128 fields of a mapping region, in encoded order.
    fn read_raw_mapping_region(&mut self) -> anyhow::Result<MappingRegion> {
        Ok(MappingRegion {
            start_line_offset: self.read_uleb128_u32()?,
            start_column: self.read_uleb128_u32()?,
            end_line_offset: self.read_uleb128_u32()?,
            end_column: self.read_uleb128_u32()?,
        })
    }
}
/// A coverage term: either a constant zero, the ID of a physical coverage
/// counter, or the ID (and operator) of a coverage-counter expression.
///
/// Terms appear as the operands of coverage-counter expressions, as the arms
/// of branch mappings, and as the value of code/gap mappings.
#[derive(Clone, Copy, Debug)]
pub(crate) enum CovTerm {
    Zero,
    Counter(u32),
    Expression(u32, Op),
}

/// The operator (subtraction or addition) applied by an expression.
#[derive(Clone, Copy, Debug)]
pub(crate) enum Op {
    Sub,
    Add,
}

impl CovTerm {
    /// Decodes a raw term value whose low two bits are a tag and whose
    /// remaining bits are an ID.
    ///
    /// Returns `None` when the tag is `0b00` but the ID bits are non-zero.
    pub(crate) fn decode(input: u32) -> Option<Self> {
        let id = input >> 2;
        match input & 0b11 {
            0b01 => Some(Self::Counter(id)),
            0b10 => Some(Self::Expression(id, Op::Sub)),
            0b11 => Some(Self::Expression(id, Op::Add)),
            // Only the all-zero encoding is accepted as a zero term.
            _ if id == 0 => Some(Self::Zero),
            // When reading expression operands or branch arms, the LLVM
            // coverage mapping reader will always interpret a `0b00` tag as
            // a zero term, even when the high bits are non-zero.
            // That case is treated as failure here instead, so that this
            // code can be shared by the full mapping-kind reader as well.
            _ => None,
        }
    }
}
/// The kind of a single coverage mapping, as produced by the mapping-kind
/// reader in this file. The derived `Debug` output is printed directly in
/// the dump.
#[derive(Debug)]
enum MappingKind {
/// Ordinary code mapping whose value is the given term.
Code(CovTerm),
/// Code mapping whose region had the high bit of its end column set.
Gap(CovTerm),
/// Expansion mapping; the payload is the raw kind value shifted right by 3
/// (presumably the expanded file ID — confirm against the LLVM format docs).
Expansion(u32),
/// Mapping with no term attached (raw kind value `2 << 3`).
Skip,
// Using raw identifiers here makes the dump output a little bit nicer
// (via the derived Debug), at the expense of making this tool's source
// code a little bit uglier.
Branch { r#true: CovTerm, r#false: CovTerm },
}
/// Source region covered by one mapping, stored in its encoded (relative)
/// form.
struct MappingRegion {
    /// Offset of this region's start line, relative to the *start line* of
    /// the *previous mapping* (or 0). Line numbers are 1-based.
    start_line_offset: u32,
    /// This region's start column, absolute and 1-based.
    start_column: u32,
    /// Offset of this region's end line, relative to *this mapping's*
    /// start line. Line numbers are 1-based.
    end_line_offset: u32,
    /// This region's end column, absolute, 1-based, and exclusive.
    ///
    /// If the highest bit is set, that bit is cleared and the associated
    /// mapping becomes a gap region mapping.
    end_column: u32,
}

impl Debug for MappingRegion {
    /// Formats the region as `(prev + L, C) to (start + L, C)`, making the
    /// relative nature of the line numbers explicit.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { start_line_offset, start_column, end_line_offset, end_column } = self;
        f.write_fmt(format_args!(
            "(prev + {}, {}) to (start + {}, {})",
            start_line_offset, start_column, end_line_offset, end_column
        ))
    }
}
/// Helper type that prints expressions in a "resolved" form, so that
/// developers reading the dump don't need to resolve expressions by hand.
struct ExpressionResolver {
/// Operand pairs `(lhs, rhs)` in the order they were pushed; an expression
/// term's ID is used as an index into this list.
operands: Vec<(CovTerm, CovTerm)>,
}
impl ExpressionResolver {
fn new() -> Self {
Self { operands: Vec::new() }
}
fn push_operands(&mut self, lhs: CovTerm, rhs: CovTerm) {
self.operands.push((lhs, rhs));
}
fn format_term(&self, term: CovTerm) -> String {
let mut output = String::new();
self.write_term(&mut output, term);
output
}
fn write_term(&self, output: &mut String, term: CovTerm) {
match term {
CovTerm::Zero => output.push_str("Zero"),
CovTerm::Counter(id) => write!(output, "c{id}").unwrap(),
CovTerm::Expression(id, op) => {
let (lhs, rhs) = self.operands[id as usize];
let op = match op {
Op::Sub => "-",
Op::Add => "+",
};
output.push('(');
self.write_term(output, lhs);
write!(output, " {op} ").unwrap();
self.write_term(output, rhs);
output.push(')');
}
}
}
}

View File

@ -0,0 +1,17 @@
mod covfun;
mod parser;
mod prf_names;
/// Entry point: reads the LLVM IR file named by the first command-line
/// argument, builds the function-name table from it, and prints the
/// coverage mapping dump to stdout.
fn main() -> anyhow::Result<()> {
    use anyhow::Context as _;

    let cli_args: Vec<String> = std::env::args().collect();
    let llvm_ir_path = cli_args.get(1).context("LLVM IR file not specified")?;
    let llvm_ir = std::fs::read_to_string(llvm_ir_path).context("couldn't read LLVM IR file")?;

    // Names must be known before dumping, so that each record can be labelled.
    let function_names = crate::prf_names::make_function_names_table(&llvm_ir)?;
    crate::covfun::dump_covfun_mappings(&llvm_ir, &function_names)
}

View File

@ -0,0 +1,80 @@
#[cfg(test)]
mod tests;
use anyhow::ensure;
use regex::bytes;
use std::sync::OnceLock;
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
/// backslash escapes and returns a vector containing the resulting byte string.
///
/// Two escapes are recognized: `\\` (a literal backslash) and `\xx`, where
/// `xx` is exactly two hexadecimal digits (either case). A backslash that is
/// not part of one of those sequences is copied through unchanged.
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
    let escape_re = {
        static RE: OnceLock<bytes::Regex> = OnceLock::new();
        // LLVM IR supports two string escapes: `\\` and `\xx`.
        // The character class is restricted to hexadecimal digits, so every
        // captured pair can be parsed as a byte. A wider class would let
        // sequences such as `\zz` match and then fail to parse.
        RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").unwrap())
    };

    fn u8_from_hex_digits(digits: &[u8]) -> u8 {
        // The regex only captures exactly 2 ASCII hex digits, so these calls
        // should never fail.
        assert_eq!(digits.len(), 2);
        let digits = std::str::from_utf8(digits).unwrap();
        u8::from_str_radix(digits, 16).unwrap()
    }

    escape_re
        .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
            let byte = match captures.get(1) {
                // No capture group means the `\\` alternative matched.
                None => b'\\',
                Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
            };
            [byte]
        })
        .into_owned()
}
/// Cursor over a borrowed byte slice. Successful reads consume bytes from
/// the front of the slice.
pub(crate) struct Parser<'a> {
/// The bytes that have not been consumed yet.
rest: &'a [u8],
}
impl<'a> Parser<'a> {
pub(crate) fn new(input: &'a [u8]) -> Self {
Self { rest: input }
}
pub(crate) fn ensure_empty(self) -> anyhow::Result<()> {
ensure!(self.rest.is_empty(), "unparsed bytes: 0x{:02x?}", self.rest);
Ok(())
}
pub(crate) fn read_n_bytes(&mut self, n: usize) -> anyhow::Result<&'a [u8]> {
ensure!(n <= self.rest.len());
let (bytes, rest) = self.rest.split_at(n);
self.rest = rest;
Ok(bytes)
}
pub(crate) fn read_uleb128_u32(&mut self) -> anyhow::Result<u32> {
self.read_uleb128_u64_and_convert()
}
pub(crate) fn read_uleb128_usize(&mut self) -> anyhow::Result<usize> {
self.read_uleb128_u64_and_convert()
}
fn read_uleb128_u64_and_convert<T>(&mut self) -> anyhow::Result<T>
where
T: TryFrom<u64> + 'static,
T::Error: std::error::Error + Send + Sync,
{
let mut temp_rest = self.rest;
let raw_value: u64 = leb128::read::unsigned(&mut temp_rest)?;
let converted_value = T::try_from(raw_value)?;
// Only update `self.rest` if the above steps succeeded, so that the
// parser position can be used for error reporting if desired.
self.rest = temp_rest;
Ok(converted_value)
}
}

View File

@ -0,0 +1,38 @@
use super::unescape_llvm_string_contents;
// WARNING: These tests don't necessarily run in CI, and were mainly used to
// help track down problems when originally developing this tool.
// (The tool is still tested indirectly by snapshot tests that rely on it.)
// Tests for `unescape_llvm_string_contents`:
/// An empty literal decodes to an empty byte string.
#[test]
fn unescape_empty() {
    let expected: &[u8] = &[];
    assert_eq!(unescape_llvm_string_contents(""), expected);
}

/// Text without any backslashes is returned unchanged.
#[test]
fn unescape_noop() {
    let plain_text = "The quick brown fox jumps over the lazy dog.";
    let decoded = unescape_llvm_string_contents(plain_text);
    assert_eq!(decoded, plain_text.as_bytes());
}

/// Each `\\` escape collapses to a single backslash byte.
#[test]
fn unescape_backslash() {
    let decoded = unescape_llvm_string_contents(r"\\Hello\\world\\");
    let expected: &[u8] = br"\Hello\world\";
    assert_eq!(decoded, expected);
}

/// `\xx` escapes decode to the corresponding byte, in either letter case.
#[test]
fn unescape_hex() {
    let decoded = unescape_llvm_string_contents(r"\01\02\03\04\0a\0b\0C\0D\fd\fE\FF");
    let expected: &[u8] = &[0x01, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, 0xfd, 0xfe, 0xff];
    assert_eq!(decoded, expected);
}

/// A `\\` escape followed by digits must not be re-read as a hex escape,
/// and `\5c` decodes to a backslash byte.
#[test]
fn unescape_mixed() {
    let decoded = unescape_llvm_string_contents(r"\\01.\5c\5c");
    let expected: &[u8] = br"\01.\\";
    assert_eq!(decoded, expected);
}

View File

@ -0,0 +1,87 @@
use crate::parser::{unescape_llvm_string_contents, Parser};
use anyhow::{anyhow, ensure};
use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;
/// Builds a table from name hash to (demangled) function name by locating
/// and decoding every `__llvm_prf_names` entry in the given LLVM IR assembly.
///
/// Returns an error if an entry's payload is malformed, cannot be
/// decompressed, or contains a name that is not valid UTF-8.
pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap<u64, String>> {
    /// Returns the (still escaped) string payload if `line` defines the
    /// `@__llvm_prf_nm` variable.
    fn prf_names_payload(line: &str) -> Option<&str> {
        // We cheat a little bit and match the variable name `@__llvm_prf_nm`
        // rather than the section name, because the section name is harder
        // to extract and differs across Linux/Windows/macOS.
        static RE: OnceLock<Regex> = OnceLock::new();
        let re = RE.get_or_init(|| {
            Regex::new(r#"^@__llvm_prf_nm =.*\[[0-9]+ x i8\] c"([^"]*)".*$"#).unwrap()
        });
        re.captures(line).map(|captures| captures.get(1).unwrap().as_str())
    }

    /// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
    /// 64 bits as a way to associate data stored in different tables/sections.
    fn truncated_md5(bytes: &[u8]) -> u64 {
        use md5::{Digest, Md5};

        let mut hasher = Md5::new();
        hasher.update(bytes);
        let digest = hasher.finalize();

        let mut low_bytes = [0u8; 8];
        low_bytes.copy_from_slice(&digest.as_slice()[..8]);
        // The truncated hash is explicitly little-endian, regardless of host
        // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
        u64::from_le_bytes(low_bytes)
    }

    /// Demangles a raw symbol name, falling back to a marked copy of the raw
    /// name when it is not a Rust mangled symbol.
    fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> {
        // In practice, raw symbol names should always be ASCII.
        let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;
        // A demangling failure is not treated as an error. This lets us run
        // the dump tool against non-Rust coverage maps produced by `clang`,
        // for testing purposes.
        let display_name = if let Ok(d) = rustc_demangle::try_demangle(symbol_name_str) {
            format!("{d:#}")
        } else {
            format!("(couldn't demangle) {symbol_name_str}")
        };
        Ok(display_name)
    }

    let mut names_by_hash = HashMap::new();

    let payloads =
        llvm_ir.lines().filter_map(prf_names_payload).map(unescape_llvm_string_contents);
    for payload in payloads {
        let mut parser = Parser::new(&payload);
        let uncompressed_len = parser.read_uleb128_usize()?;
        let compressed_len = parser.read_uleb128_usize()?;

        // Backing storage for the decompressed case; declared here so that it
        // outlives the slice borrowed from it below.
        let uncompressed_bytes_vec;
        let uncompressed_bytes: &[u8] = match compressed_len {
            // The symbol name bytes are uncompressed, so read them directly.
            0 => parser.read_n_bytes(uncompressed_len)?,
            // The symbol name bytes are compressed, so read and decompress them.
            _ => {
                let compressed_bytes = parser.read_n_bytes(compressed_len)?;
                uncompressed_bytes_vec = miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
                    compressed_bytes,
                    uncompressed_len,
                )
                .map_err(|e| anyhow!("{e:?}"))?;
                ensure!(uncompressed_bytes_vec.len() == uncompressed_len);
                &uncompressed_bytes_vec
            }
        };

        // Symbol names in the payload are separated by `0x01` bytes.
        for raw_name in uncompressed_bytes.split(|&b| b == 0x01) {
            names_by_hash.insert(truncated_md5(raw_name), demangle_if_able(raw_name)?);
        }

        parser.ensure_empty()?;
    }

    Ok(names_by_hash)
}