From 4e0ac38b52ba770d338ab6e1b05a6179f78697bb Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Mon, 12 Jun 2023 15:39:50 -0500
Subject: [PATCH] refactor(embedded): Switch to `syn` for parsing doc comments

The hope is this will result in more resilient comment handling, being
more consistent with rustdoc.

I also hoped for less code but `syn` is doing less than I had expected,
requiring us to copy code over from other parts of rust.  It seems every
proc macro has to do this but there is no guide to it, so they all do it
differently, only covering the cases they thought to test for.

Note that this still won't support `include_str!()`.
---
 Cargo.lock                      |   1 -
 Cargo.toml                      |   2 -
 src/cargo/util/toml/embedded.rs | 441 ++++++++++++++++++++++----------
 3 files changed, 300 insertions(+), 144 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e90f95b99..c11da053c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -334,7 +334,6 @@ dependencies = [
  "pretty_env_logger",
  "pulldown-cmark",
  "rand",
- "regex",
  "rustfix",
  "same-file",
  "semver",
diff --git a/Cargo.toml b/Cargo.toml
index d6d7e76d6..0e788a6af 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -67,7 +67,6 @@ pretty_env_logger = "0.4"
 proptest = "1.1.0"
 pulldown-cmark = { version = "0.9.2", default-features = false }
 rand = "0.8.5"
-regex = "1.8.3"
 rustfix = "0.6.0"
 same-file = "1.0.6"
 security-framework = "2.0.0"
@@ -153,7 +152,6 @@ pathdiff.workspace = true
 pretty_env_logger = { workspace = true, optional = true }
 pulldown-cmark.workspace = true
 rand.workspace = true
-regex.workspace = true
 rustfix.workspace = true
 semver.workspace = true
 serde = { workspace = true, features = ["derive"] }
diff --git a/src/cargo/util/toml/embedded.rs b/src/cargo/util/toml/embedded.rs
index 2200c53f5..5ecb9112d 100644
--- a/src/cargo/util/toml/embedded.rs
+++ b/src/cargo/util/toml/embedded.rs
@@ -247,156 +247,271 @@ fn write_if_changed(path: &std::path::Path, new: &str) -> CargoResult<()> {
 
 /// Locates a "code block manifest" in Rust source.
 fn extract_comment(input: &str) -> CargoResult<String> {
-    let re_crate_comment = regex::Regex::new(
-        // We need to find the first `/*!` or `//!` that *isn't* preceded by something that would
-        // make it apply to anything other than the crate itself.  Because we can't do this
-        // accurately, we'll just require that the doc-comment is the *first* thing in the file
-        // (after the optional shebang).
-        r"(?x)(^\s*|^\#![^\[].*?(\r\n|\n))(/\*!|//(!|/))",
-    )
-    .unwrap();
-    let re_margin = regex::Regex::new(r"^\s*\*( |$)").unwrap();
-    let re_space = regex::Regex::new(r"^(\s+)").unwrap();
-    let re_nesting = regex::Regex::new(r"/\*|\*/").unwrap();
-    let re_comment = regex::Regex::new(r"^\s*//(!|/)").unwrap();
-
-    fn n_leading_spaces(s: &str, n: usize) -> anyhow::Result<()> {
-        if !s.chars().take(n).all(|c| c == ' ') {
-            anyhow::bail!("leading {n:?} chars aren't all spaces: {s:?}")
+    let mut doc_fragments = Vec::new();
+    let file = syn::parse_file(input)?;
+    // HACK: `syn` doesn't tell us what kind of comment was used, so infer it from how many
+    // attributes were used
+    let kind = if 1 < file
+        .attrs
+        .iter()
+        .filter(|attr| attr.meta.path().is_ident("doc"))
+        .count()
+    {
+        CommentKind::Line
+    } else {
+        CommentKind::Block
+    };
+    for attr in &file.attrs {
+        if attr.meta.path().is_ident("doc") {
+            doc_fragments.push(DocFragment::new(attr, kind)?);
         }
-        Ok(())
+    }
+    if doc_fragments.is_empty() {
+        anyhow::bail!("no doc-comment found");
+    }
+    unindent_doc_fragments(&mut doc_fragments);
+
+    let mut doc_comment = String::new();
+    for frag in &doc_fragments {
+        add_doc_fragment(&mut doc_comment, frag);
     }
 
-    /// Returns a slice of the input string with the leading shebang, if there is one, omitted.
-    fn strip_shebang(s: &str) -> &str {
-        let re_shebang = regex::Regex::new(r"^#![^\[].*?(\r\n|\n)").unwrap();
-        re_shebang.find(s).map(|m| &s[m.end()..]).unwrap_or(s)
+    Ok(doc_comment)
+}
+
+/// A `#[doc]`
+#[derive(Clone, Debug)]
+struct DocFragment {
+    /// The attribute value
+    doc: String,
+    /// Indentation used within `doc
+    indent: usize,
+}
+
+impl DocFragment {
+    fn new(attr: &syn::Attribute, kind: CommentKind) -> CargoResult<Self> {
+        let syn::Meta::NameValue(nv) = &attr.meta else {
+            anyhow::bail!("unsupported attr meta for {:?}", attr.meta.path())
+        };
+        let syn::Expr::Lit(syn::ExprLit { lit: syn::Lit::Str(lit), .. }) = &nv.value else {
+            anyhow::bail!("only string literals are supported")
+        };
+        Ok(Self {
+            doc: beautify_doc_string(lit.value(), kind),
+            indent: 0,
+        })
+    }
+}
+
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub enum CommentKind {
+    Line,
+    Block,
+}
+
+/// Makes a doc string more presentable to users.
+/// Used by rustdoc and perhaps other tools, but not by rustc.
+///
+/// See `rustc_ast/util/comments.rs`
+fn beautify_doc_string(data: String, kind: CommentKind) -> String {
+    fn get_vertical_trim(lines: &[&str]) -> Option<(usize, usize)> {
+        let mut i = 0;
+        let mut j = lines.len();
+        // first line of all-stars should be omitted
+        if !lines.is_empty() && lines[0].chars().all(|c| c == '*') {
+            i += 1;
+        }
+
+        // like the first, a last line of all stars should be omitted
+        if j > i && !lines[j - 1].is_empty() && lines[j - 1].chars().all(|c| c == '*') {
+            j -= 1;
+        }
+
+        if i != 0 || j != lines.len() {
+            Some((i, j))
+        } else {
+            None
+        }
     }
 
-    // First, we will look for and slice out a contiguous, inner doc-comment which must be *the
-    // very first thing* in the file.  `#[doc(...)]` attributes *are not supported*.  Multiple
-    // single-line comments cannot have any blank lines between them.
-    let input = strip_shebang(input); // `re_crate_comment` doesn't work with shebangs
-    let start = re_crate_comment
-        .captures(input)
-        .ok_or_else(|| anyhow::format_err!("no doc-comment found"))?
-        .get(3)
-        .ok_or_else(|| anyhow::format_err!("no doc-comment found"))?
-        .start();
+    fn get_horizontal_trim(lines: &[&str], kind: CommentKind) -> Option<String> {
+        let mut i = usize::MAX;
+        let mut first = true;
 
-    let input = &input[start..];
+        // In case we have doc comments like `/**` or `/*!`, we want to remove stars if they are
+        // present. However, we first need to strip the empty lines so they don't get in the middle
+        // when we try to compute the "horizontal trim".
+        let lines = match kind {
+            CommentKind::Block => {
+                // Whatever happens, we skip the first line.
+                let mut i = lines
+                    .get(0)
+                    .map(|l| {
+                        if l.trim_start().starts_with('*') {
+                            0
+                        } else {
+                            1
+                        }
+                    })
+                    .unwrap_or(0);
+                let mut j = lines.len();
 
-    if let Some(input) = input.strip_prefix("/*!") {
-        // On every line:
-        //
-        // - update nesting level and detect end-of-comment
-        // - if margin is None:
-        //     - if there appears to be a margin, set margin.
-        // - strip off margin marker
-        // - update the leading space counter
-        // - strip leading space
-        // - append content
-        let mut r = String::new();
-
-        let mut leading_space = None;
-        let mut margin = None;
-        let mut depth: u32 = 1;
-
-        for line in input.lines() {
-            if depth == 0 {
-                break;
+                while i < j && lines[i].trim().is_empty() {
+                    i += 1;
+                }
+                while j > i && lines[j - 1].trim().is_empty() {
+                    j -= 1;
+                }
+                &lines[i..j]
             }
+            CommentKind::Line => lines,
+        };
 
-            // Update nesting and look for end-of-comment.
-            let mut end_of_comment = None;
-
-            for (end, marker) in re_nesting.find_iter(line).map(|m| (m.start(), m.as_str())) {
-                match (marker, depth) {
-                    ("/*", _) => depth += 1,
-                    ("*/", 1) => {
-                        end_of_comment = Some(end);
-                        depth = 0;
-                        break;
+        for line in lines {
+            for (j, c) in line.chars().enumerate() {
+                if j > i || !"* \t".contains(c) {
+                    return None;
+                }
+                if c == '*' {
+                    if first {
+                        i = j;
+                        first = false;
+                    } else if i != j {
+                        return None;
                     }
-                    ("*/", _) => depth -= 1,
-                    _ => panic!("got a comment marker other than /* or */"),
+                    break;
                 }
             }
+            if i >= line.len() {
+                return None;
+            }
+        }
+        if lines.is_empty() {
+            None
+        } else {
+            Some(lines[0][..i].into())
+        }
+    }
 
-            let line = end_of_comment.map(|end| &line[..end]).unwrap_or(line);
+    let data_s = data.as_str();
+    if data_s.contains('\n') {
+        let mut lines = data_s.lines().collect::<Vec<&str>>();
+        let mut changes = false;
+        let lines = if let Some((i, j)) = get_vertical_trim(&lines) {
+            changes = true;
+            // remove whitespace-only lines from the start/end of lines
+            &mut lines[i..j]
+        } else {
+            &mut lines
+        };
+        if let Some(horizontal) = get_horizontal_trim(lines, kind) {
+            changes = true;
+            // remove a "[ \t]*\*" block from each line, if possible
+            for line in lines.iter_mut() {
+                if let Some(tmp) = line.strip_prefix(&horizontal) {
+                    *line = tmp;
+                    if kind == CommentKind::Block
+                        && (*line == "*" || line.starts_with("* ") || line.starts_with("**"))
+                    {
+                        *line = &line[1..];
+                    }
+                }
+            }
+        }
+        if changes {
+            return lines.join("\n");
+        }
+    }
+    data
+}
 
-            // Detect and strip margin.
-            margin = margin.or_else(|| re_margin.find(line).map(|m| m.as_str()));
+/// Removes excess indentation on comments in order for the Markdown
+/// to be parsed correctly. This is necessary because the convention for
+/// writing documentation is to provide a space between the /// or //! marker
+/// and the doc text, but Markdown is whitespace-sensitive. For example,
+/// a block of text with four-space indentation is parsed as a code block,
+/// so if we didn't unindent comments, these list items
+///
+/// /// A list:
+/// ///
+/// ///    - Foo
+/// ///    - Bar
+///
+/// would be parsed as if they were in a code block, which is likely not what the user intended.
+///
+/// See also `rustc_resolve/rustdoc.rs`
+fn unindent_doc_fragments(docs: &mut [DocFragment]) {
+    // HACK: We can't tell the difference between `#[doc]` and doc-comments, so we can't specialize
+    // the indentation like rustodc does
+    let add = 0;
 
-            let line = if let Some(margin) = margin {
-                let end = line
-                    .char_indices()
-                    .take(margin.len())
-                    .map(|(i, c)| i + c.len_utf8())
-                    .last()
-                    .unwrap_or(0);
-                &line[end..]
-            } else {
-                line
-            };
+    // `min_indent` is used to know how much whitespaces from the start of each lines must be
+    // removed. Example:
+    //
+    // ```
+    // ///     hello!
+    // #[doc = "another"]
+    // ```
+    //
+    // In here, the `min_indent` is 1 (because non-sugared fragment are always counted with minimum
+    // 1 whitespace), meaning that "hello!" will be considered a codeblock because it starts with 4
+    // (5 - 1) whitespaces.
+    let Some(min_indent) = docs
+        .iter()
+        .map(|fragment| {
+            fragment.doc.as_str().lines().fold(usize::MAX, |min_indent, line| {
+                if line.chars().all(|c| c.is_whitespace()) {
+                    min_indent
+                } else {
+                    // Compare against either space or tab, ignoring whether they are
+                    // mixed or not.
+                    let whitespace = line.chars().take_while(|c| *c == ' ' || *c == '\t').count();
+                    min_indent.min(whitespace)
+                }
+            })
+        })
+        .min()
+    else {
+        return;
+    };
 
-            // Detect and strip leading indentation.
-            leading_space = leading_space.or_else(|| re_space.find(line).map(|m| m.end()));
-
-            // Make sure we have only leading spaces.
-            //
-            // If we see a tab, fall over.  I *would* expand them, but that gets into the question of how *many* spaces to expand them to, and *where* is the tab, because tabs are tab stops and not just N spaces.
-            n_leading_spaces(line, leading_space.unwrap_or(0))?;
-
-            let strip_len = line.len().min(leading_space.unwrap_or(0));
-            let line = &line[strip_len..];
-
-            // Done.
-            r.push_str(line);
-
-            // `lines` removes newlines.  Ideally, it wouldn't do that, but hopefully this shouldn't cause any *real* problems.
-            r.push('\n');
+    for fragment in docs {
+        if fragment.doc.is_empty() {
+            continue;
         }
 
-        Ok(r)
-    } else if input.starts_with("//!") || input.starts_with("///") {
-        let mut r = String::new();
+        let min_indent = if min_indent > 0 {
+            min_indent - add
+        } else {
+            min_indent
+        };
 
-        let mut leading_space = None;
+        fragment.indent = min_indent;
+    }
+}
 
-        for line in input.lines() {
-            // Strip leading comment marker.
-            let content = match re_comment.find(line) {
-                Some(m) => &line[m.end()..],
-                None => break,
-            };
-
-            // Detect and strip leading indentation.
-            leading_space = leading_space.or_else(|| {
-                re_space
-                    .captures(content)
-                    .and_then(|c| c.get(1))
-                    .map(|m| m.end())
-            });
-
-            // Make sure we have only leading spaces.
-            //
-            // If we see a tab, fall over.  I *would* expand them, but that gets into the question of how *many* spaces to expand them to, and *where* is the tab, because tabs are tab stops and not just N spaces.
-            n_leading_spaces(content, leading_space.unwrap_or(0))?;
-
-            let strip_len = content.len().min(leading_space.unwrap_or(0));
-            let content = &content[strip_len..];
-
-            // Done.
-            r.push_str(content);
-
-            // `lines` removes newlines.  Ideally, it wouldn't do that, but hopefully this shouldn't cause any *real* problems.
-            r.push('\n');
+/// The goal of this function is to apply the `DocFragment` transformation that is required when
+/// transforming into the final Markdown, which is applying the computed indent to each line in
+/// each doc fragment (a `DocFragment` can contain multiple lines in case of `#[doc = ""]`).
+///
+/// Note: remove the trailing newline where appropriate
+///
+/// See also `rustc_resolve/rustdoc.rs`
+fn add_doc_fragment(out: &mut String, frag: &DocFragment) {
+    let s = frag.doc.as_str();
+    let mut iter = s.lines();
+    if s.is_empty() {
+        out.push('\n');
+        return;
+    }
+    while let Some(line) = iter.next() {
+        if line.chars().any(|c| !c.is_whitespace()) {
+            assert!(line.len() >= frag.indent);
+            out.push_str(&line[frag.indent..]);
+        } else {
+            out.push_str(line);
         }
-
-        Ok(r)
-    } else {
-        Err(anyhow::format_err!("no doc-comment found"))
+        out.push('\n');
     }
 }
 
@@ -593,14 +708,12 @@ fn main() {}
     #[test]
     fn test_multiline_comment() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"/*!
 Here is a manifest:
@@ -620,14 +733,12 @@ fn main() {
     #[test]
     fn test_multiline_comment_shebang() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"#!/usr/bin/env cargo-eval
 
@@ -649,14 +760,12 @@ fn main() {
     #[test]
     fn test_multiline_block_comment() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"/*!
  * Here is a manifest:
@@ -674,14 +783,12 @@ fn main() {}
     #[test]
     fn test_multiline_block_comment_shebang() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"#!/usr/bin/env cargo-eval
 
@@ -697,6 +804,58 @@ fn main() {}
 "#),
         );
     }
+
+    #[test]
+    fn test_adjacent_comments() {
+        snapbox::assert_eq(
+            r#"Here is a manifest:
+
+```cargo
+[dependencies]
+time = "*"
+```
+"#,
+            ec!(r#"#!/usr/bin/env cargo-eval
+
+// I am a normal comment
+//! Here is a manifest:
+//!
+//! ```cargo
+//! [dependencies]
+//! time = "*"
+//! ```
+
+fn main () {
+}
+"#),
+        );
+    }
+
+    #[test]
+    fn test_doc_attrib() {
+        snapbox::assert_eq(
+            r#"Here is a manifest:
+
+```cargo
+[dependencies]
+time = "*"
+```
+"#,
+            ec!(r###"#!/usr/bin/env cargo-eval
+
+#![doc = r#"Here is a manifest:
+
+```cargo
+[dependencies]
+time = "*"
+```
+"#]
+
+fn main () {
+}
+"###),
+        );
+    }
 }
 
 /// Given a Cargo manifest, attempts to rewrite relative file paths to absolute ones, allowing the manifest to be relocated.