diff --git a/Cargo.lock b/Cargo.lock index 9f64aa44314..eadaf721f02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -563,6 +563,7 @@ dependencies = [ "libc", "num-integer", "num-traits", + "serde", "time", "winapi", ] @@ -712,6 +713,16 @@ dependencies = [ "rustc-semver", ] +[[package]] +name = "collect-license-metadata" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "serde_json", + "spdx-rs", +] + [[package]] name = "color-eyre" version = "0.6.2" @@ -4628,6 +4639,35 @@ dependencies = [ "winapi", ] +[[package]] +name = "spdx-expression" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d7ac03c67c572d85049d6db815e20a4a19b41b3d5cca732ac582342021ad77" +dependencies = [ + "nom", + "serde", + "thiserror", + "tracing", +] + +[[package]] +name = "spdx-rs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3c02f6eb7e7b4100c272f685a9ccaccaab302324e8c7ec3e2ee72340fb29ff3" +dependencies = [ + "chrono", + "log", + "nom", + "serde", + "spdx-expression", + "strum", + "strum_macros", + "thiserror", + "uuid", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -4731,6 +4771,25 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "1.0.102" @@ -5357,6 +5416,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8772a4ccbb4e89959023bc5b7cb8623a795caa7092d99f3aa9501b9484d4557d" +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.0", +] + [[package]] name = "valuable" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 13a98eedde8..ddaf9b41f86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ members = [ "src/tools/bump-stage0", "src/tools/replace-version-placeholder", "src/tools/lld-wrapper", + "src/tools/collect-license-metadata", ] exclude = [ diff --git a/src/bootstrap/builder.rs b/src/bootstrap/builder.rs index 8d999302a6d..803b1c266f3 100644 --- a/src/bootstrap/builder.rs +++ b/src/bootstrap/builder.rs @@ -753,6 +753,7 @@ macro_rules! describe { run::BumpStage0, run::ReplaceVersionPlaceholder, run::Miri, + run::CollectLicenseMetadata, ), // These commands either don't use paths, or they're special-cased in Build::build() Kind::Clean | Kind::Format | Kind::Setup => vec![], diff --git a/src/bootstrap/run.rs b/src/bootstrap/run.rs index d49b41c5132..8cbfe3ebab5 100644 --- a/src/bootstrap/run.rs +++ b/src/bootstrap/run.rs @@ -1,3 +1,4 @@ +use std::path::PathBuf; use std::process::Command; use crate::builder::{Builder, RunConfig, ShouldRun, Step}; @@ -189,3 +190,35 @@ fn run(self, builder: &Builder<'_>) { builder.run(&mut miri); } } + +#[derive(Debug, PartialOrd, Ord, Copy, Clone, Hash, PartialEq, Eq)] +pub struct CollectLicenseMetadata; + +impl Step for CollectLicenseMetadata { + type Output = PathBuf; + const ONLY_HOSTS: bool = true; + + fn should_run(run: ShouldRun<'_>) -> ShouldRun<'_> { + run.path("src/tools/collect-license-metadata") + } + + fn make_run(run: RunConfig<'_>) { + run.builder.ensure(CollectLicenseMetadata); + } + + fn run(self, builder: &Builder<'_>) -> Self::Output { + let Some(reuse) = &builder.config.reuse else { + panic!("REUSE is required to collect the license metadata"); + }; + + // Temporary location, it will be moved to src/etc once it's accurate. + let dest = builder.out.join("license-metadata.json"); + + let mut cmd = builder.tool_cmd(Tool::CollectLicenseMetadata); + cmd.env("REUSE_EXE", reuse); + cmd.env("DEST", &dest); + builder.run(&mut cmd); + + dest + } +} diff --git a/src/bootstrap/tool.rs b/src/bootstrap/tool.rs index ba329ea6c75..4dd9a40dcb3 100644 --- a/src/bootstrap/tool.rs +++ b/src/bootstrap/tool.rs @@ -380,6 +380,7 @@ fn run(self, builder: &Builder<'_>) -> PathBuf { HtmlChecker, "src/tools/html-checker", "html-checker"; BumpStage0, "src/tools/bump-stage0", "bump-stage0"; ReplaceVersionPlaceholder, "src/tools/replace-version-placeholder", "replace-version-placeholder"; + CollectLicenseMetadata, "src/tools/collect-license-metadata", "collect-license-metadata"; ); #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)] diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml new file mode 100644 index 00000000000..d0820cfc2a0 --- /dev/null +++ b/src/tools/collect-license-metadata/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "collect-license-metadata" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.65" +serde = { version = "1.0.147", features = ["derive"] } +serde_json = "1.0.85" +spdx-rs = "0.5.1" diff --git a/src/tools/collect-license-metadata/src/licenses.rs b/src/tools/collect-license-metadata/src/licenses.rs new file mode 100644 index 00000000000..34aabc87301 --- /dev/null +++ b/src/tools/collect-license-metadata/src/licenses.rs @@ -0,0 +1,37 @@ +use std::collections::HashMap; + +pub(crate) struct LicensesInterner { + by_id: Vec, + by_struct: HashMap, +} + +impl LicensesInterner { + pub(crate) fn new() -> Self { + LicensesInterner { by_id: Vec::new(), by_struct: HashMap::new() } + } + + pub(crate) fn intern(&mut self, license: License) -> LicenseId { + if let Some(id) = self.by_struct.get(&license) { + LicenseId(*id) + } else { + let id = self.by_id.len(); + self.by_id.push(license.clone()); + self.by_struct.insert(license, id); + LicenseId(id) + } + } + + pub(crate) fn resolve(&self, id: LicenseId) -> &License { + &self.by_id[id.0] + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)] +#[serde(transparent)] +pub(crate) struct LicenseId(usize); + +#[derive(Clone, Hash, PartialEq, Eq, serde::Serialize)] +pub(crate) struct License { + pub(crate) spdx: String, + pub(crate) copyright: Vec, +} diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs new file mode 100644 index 00000000000..0864408fd53 --- /dev/null +++ b/src/tools/collect-license-metadata/src/main.rs @@ -0,0 +1,30 @@ +mod licenses; +mod path_tree; +mod reuse; + +use crate::licenses::LicensesInterner; +use anyhow::Error; +use std::path::PathBuf; + +fn main() -> Result<(), Error> { + let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into(); + let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into(); + + let mut interner = LicensesInterner::new(); + let paths = crate::reuse::collect(&reuse_exe, &mut interner)?; + + let mut tree = crate::path_tree::build(paths); + tree.simplify(); + + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write( + &dest, + &serde_json::to_vec_pretty(&serde_json::json!({ + "files": crate::path_tree::strip_interning(tree, &interner), + }))?, + )?; + + Ok(()) +} diff --git a/src/tools/collect-license-metadata/src/path_tree.rs b/src/tools/collect-license-metadata/src/path_tree.rs new file mode 100644 index 00000000000..6278970d813 --- /dev/null +++ b/src/tools/collect-license-metadata/src/path_tree.rs @@ -0,0 +1,292 @@ +//! Tools like REUSE output per-file licensing information, but we need to condense it in the +//! minimum amount of data that still represents the same licensing metadata. This module is +//! responsible for that, by turning the list of paths into a tree and executing simplification +//! passes over the tree to remove redundant information. + +use crate::licenses::{License, LicenseId, LicensesInterner}; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; + +#[derive(serde::Serialize)] +#[serde(rename_all = "kebab-case", tag = "type")] +pub(crate) enum Node { + Root { childs: Vec> }, + Directory { name: PathBuf, childs: Vec>, license: Option }, + File { name: PathBuf, license: L }, + FileGroup { names: Vec, license: L }, + Empty, +} + +impl Node { + pub(crate) fn simplify(&mut self) { + self.merge_directories(); + self.collapse_in_licensed_directories(); + self.merge_directory_licenses(); + self.merge_file_groups(); + self.remove_empty(); + } + + /// Initially, trees are built by the build() function with each file practically having a + /// separate directory tree, like so: + /// + /// ```text + /// ┌─► ./ ──► compiler/ ──► rustc/ ──► src/ ──► main.rs + /// │ + /// ─┼─► ./ ──► compiler/ ──► rustc/ ──► Cargo.toml + /// │ + /// └─► ./ ──► library/ ───► std/ ──► Cargo.toml + /// ``` + /// + /// This pass is responsible for turning that into a proper directory tree: + /// + /// ```text + /// ┌─► compiler/ ──► rustc/ ──┬─► src/ ──► main.rs + /// │ │ + /// ──► ./ ──┤ └─► Cargo.toml + /// │ + /// └─► library/ ───► std/ ──► Cargo.toml + /// ``` + fn merge_directories(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, license: None, .. } => { + let mut directories = BTreeMap::new(); + let mut files = Vec::new(); + + for child in childs.drain(..) { + match child { + Node::Directory { name, mut childs, license: None } => { + directories.entry(name).or_insert_with(Vec::new).append(&mut childs); + } + file @ Node::File { .. } => { + files.push(file); + } + Node::Empty => {} + Node::Root { .. } => { + panic!("can't have a root inside another element"); + } + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { license: Some(_), .. } => { + panic!("license should not be set at this stage"); + } + } + } + + childs.extend(directories.into_iter().map(|(name, childs)| Node::Directory { + name, + childs, + license: None, + })); + childs.append(&mut files); + + for child in &mut *childs { + child.merge_directories(); + } + } + Node::Empty => {} + Node::File { .. } => {} + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { license: Some(_), .. } => { + panic!("license should not be set at this stage"); + } + } + } + + /// In our codebase, most files in a directory have the same license as the other files in that + /// same directory, so it's redundant to store licensing metadata for all the files. Instead, + /// we can add a license for a whole directory, and only record the exceptions to a directory + /// licensing metadata. + /// + /// We cannot instead record only the difference to Rust's standard licensing, as the majority + /// of the files in our repository are *not* licensed under Rust's standard licensing due to + /// our inclusion of LLVM. + fn collapse_in_licensed_directories(&mut self) { + match self { + Node::Directory { childs, license, .. } => { + for child in &mut *childs { + child.collapse_in_licensed_directories(); + } + + let mut licenses_count = BTreeMap::new(); + for child in &*childs { + let Some(license) = child.license() else { continue }; + *licenses_count.entry(license).or_insert(0) += 1; + } + + let most_popular_license = licenses_count + .into_iter() + .max_by_key(|(_, count)| *count) + .map(|(license, _)| license); + + if let Some(most_popular_license) = most_popular_license { + childs.retain(|child| child.license() != Some(most_popular_license)); + *license = Some(most_popular_license); + } + } + Node::Root { childs } => { + for child in &mut *childs { + child.collapse_in_licensed_directories(); + } + } + Node::File { .. } => {} + Node::FileGroup { .. } => {} + Node::Empty => {} + } + } + + /// Reduce the depth of the tree by merging subdirectories with the same license as their + /// parent directory into their parent, and adjusting the paths of the childs accordingly. + fn merge_directory_licenses(&mut self) { + match self { + Node::Root { childs } => { + for child in &mut *childs { + child.merge_directory_licenses(); + } + } + Node::Directory { childs, license, .. } => { + let mut to_add = Vec::new(); + for child in &mut *childs { + child.merge_directory_licenses(); + + let Node::Directory { + name: child_name, + childs: child_childs, + license: child_license, + } = child else { continue }; + + if child_license != license { + continue; + } + for mut child_child in child_childs.drain(..) { + match &mut child_child { + Node::Root { .. } => { + panic!("can't have a root inside another element"); + } + Node::FileGroup { .. } => { + panic!("FileGroup should not be present at this stage"); + } + Node::Directory { name: child_child_name, .. } => { + *child_child_name = child_name.join(&child_child_name); + } + Node::File { name: child_child_name, .. } => { + *child_child_name = child_name.join(&child_child_name); + } + Node::Empty => {} + } + to_add.push(child_child); + } + + *child = Node::Empty; + } + childs.append(&mut to_add); + } + Node::Empty => {} + Node::File { .. } => {} + Node::FileGroup { .. } => {} + } + } + + /// This pass groups multiple files in a directory with the same license into a single + /// "FileGroup", so that the license of all those files can be reported as a group. + /// + /// Crucially this pass runs after collapse_in_licensed_directories, so the most common license + /// will already be marked as the directory's license and won't be turned into a group. + fn merge_file_groups(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, .. } => { + let mut grouped = BTreeMap::new(); + + for child in &mut *childs { + child.merge_file_groups(); + if let Node::File { name, license } = child { + grouped.entry(*license).or_insert_with(Vec::new).push(name.clone()); + *child = Node::Empty; + } + } + + for (license, mut names) in grouped.into_iter() { + if names.len() == 1 { + childs.push(Node::File { license, name: names.pop().unwrap() }); + } else { + childs.push(Node::FileGroup { license, names }); + } + } + } + Node::File { .. } => {} + Node::FileGroup { .. } => panic!("FileGroup should not be present at this stage"), + Node::Empty => {} + } + } + + /// Some nodes were replaced with Node::Empty to mark them for deletion. As the last step, make + /// sure to remove them from the tree. + fn remove_empty(&mut self) { + match self { + Node::Root { childs } | Node::Directory { childs, .. } => { + for child in &mut *childs { + child.remove_empty(); + } + childs.retain(|child| !matches!(child, Node::Empty)); + } + Node::FileGroup { .. } => {} + Node::File { .. } => {} + Node::Empty => {} + } + } + + fn license(&self) -> Option { + match self { + Node::Directory { childs, license: Some(license), .. } if childs.is_empty() => { + Some(*license) + } + Node::File { license, .. } => Some(*license), + _ => None, + } + } +} + +pub(crate) fn build(mut input: Vec<(PathBuf, LicenseId)>) -> Node { + let mut childs = Vec::new(); + + // Ensure reproducibility of all future steps. + input.sort(); + + for (path, license) in input { + let mut node = Node::File { name: path.file_name().unwrap().into(), license }; + for component in path.parent().unwrap_or_else(|| Path::new(".")).components().rev() { + node = Node::Directory { + name: component.as_os_str().into(), + childs: vec![node], + license: None, + }; + } + + childs.push(node); + } + + Node::Root { childs } +} + +pub(crate) fn strip_interning( + node: Node, + interner: &LicensesInterner, +) -> Node<&License> { + match node { + Node::Root { childs } => Node::Root { + childs: childs.into_iter().map(|child| strip_interning(child, interner)).collect(), + }, + Node::Directory { name, childs, license } => Node::Directory { + childs: childs.into_iter().map(|child| strip_interning(child, interner)).collect(), + license: license.map(|license| interner.resolve(license)), + name, + }, + Node::File { name, license } => Node::File { name, license: interner.resolve(license) }, + Node::FileGroup { names, license } => { + Node::FileGroup { names, license: interner.resolve(license) } + } + Node::Empty => Node::Empty, + } +} diff --git a/src/tools/collect-license-metadata/src/reuse.rs b/src/tools/collect-license-metadata/src/reuse.rs new file mode 100644 index 00000000000..d6b3772ba51 --- /dev/null +++ b/src/tools/collect-license-metadata/src/reuse.rs @@ -0,0 +1,49 @@ +use crate::licenses::{License, LicenseId, LicensesInterner}; +use anyhow::Error; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Instant; + +pub(crate) fn collect( + reuse_exe: &Path, + interner: &mut LicensesInterner, +) -> Result, Error> { + eprintln!("gathering license information from REUSE"); + let start = Instant::now(); + let raw = &obtain_spdx_document(reuse_exe)?; + eprintln!("finished gathering the license information from REUSE in {:.2?}", start.elapsed()); + + let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?; + + let mut result = Vec::new(); + for file in document.file_information { + let license = interner.intern(License { + spdx: file.concluded_license.to_string(), + copyright: file.copyright_text.split('\n').map(|s| s.into()).collect(), + }); + + result.push((file.file_name.into(), license)); + } + + Ok(result) +} + +fn obtain_spdx_document(reuse_exe: &Path) -> Result { + let output = Command::new(reuse_exe) + .args(&["spdx", "--add-license-concluded", "--creator-person=bors"]) + .stdout(Stdio::piped()) + .spawn()? + .wait_with_output()?; + + if !output.status.success() { + eprintln!(); + eprintln!("Note that Rust requires some REUSE features that might not be present in the"); + eprintln!("release you're using. Make sure your REUSE release includes these PRs:"); + eprintln!(); + eprintln!(" - https://github.com/fsfe/reuse-tool/pull/623"); + eprintln!(); + anyhow::bail!("collecting licensing information with REUSE failed"); + } + + Ok(String::from_utf8(output.stdout)?) +}