mirror of
https://github.com/XAMPPRocky/tokei
synced 2024-10-01 05:23:37 +00:00
Fuzzing (#726)
* Add fuzzing support See fuzz/README.md for details. Example bug: #725 * improve fuzz docs * fuzzing: make relevant config part of input Most of the config is to do with what files to parse, however one setting - treat_doc_strings_as_comments - does impact parse_from_slice * fuzzing: improve docs + config clarity * fuzz/README.md: install instructions + another todo item
This commit is contained in:
parent
9f4f457a3a
commit
66967a1b9e
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -18,6 +18,15 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arbitrary"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "698b65a961a9d730fb45b6b0327e20207810c9f61ee421b082b27ba003f49e2b"
|
||||
dependencies = [
|
||||
"derive_arbitrary",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.4.12"
|
||||
|
@ -207,6 +216,17 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_arbitrary"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df89dd0d075dea5cc5fdd6d5df6b8a61172a710b3efac1d6bdb9dd8b78f82c1a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deunicode"
|
||||
version = "0.4.3"
|
||||
|
@ -1073,6 +1093,7 @@ name = "tokei"
|
|||
version = "12.1.2"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"arbitrary",
|
||||
"clap",
|
||||
"crossbeam-channel",
|
||||
"dashmap",
|
||||
|
|
|
@ -30,6 +30,7 @@ serde_json = "1.0.64"
|
|||
|
||||
[dependencies]
|
||||
aho-corasick = "0.7.15"
|
||||
arbitrary = { version = "1.0.0", features = ["derive"] }
|
||||
clap = "2.33.3"
|
||||
crossbeam-channel = "0.5.0"
|
||||
encoding_rs_io = "0.1.7"
|
||||
|
|
4
fuzz/.gitignore
vendored
Normal file
4
fuzz/.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
|
||||
target
|
||||
corpus
|
||||
artifacts
|
1147
fuzz/Cargo.lock
generated
Normal file
1147
fuzz/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
33
fuzz/Cargo.toml
Normal file
33
fuzz/Cargo.toml
Normal file
|
@ -0,0 +1,33 @@
|
|||
|
||||
[package]
|
||||
name = "tokei-fuzz"
|
||||
version = "0.0.1"
|
||||
authors = ["Michael Macnair"]
|
||||
publish = false
|
||||
edition = "2018"
|
||||
|
||||
[package.metadata]
|
||||
cargo-fuzz = true
|
||||
|
||||
[dependencies]
|
||||
libfuzzer-sys = "0.4"
|
||||
arbitrary = { version = "1.0.0", features = ["derive"] }
|
||||
|
||||
[dependencies.tokei]
|
||||
path = ".."
|
||||
|
||||
# Prevent this from interfering with workspaces
|
||||
[workspace]
|
||||
members = ["."]
|
||||
|
||||
[[bin]]
|
||||
name = "parse_from_slice_total"
|
||||
path = "fuzz_targets/parse_from_slice_total.rs"
|
||||
test = false
|
||||
doc = false
|
||||
|
||||
[[bin]]
|
||||
name = "parse_from_slice_panic"
|
||||
path = "fuzz_targets/parse_from_slice_panic.rs"
|
||||
test = false
|
||||
doc = false
|
30
fuzz/README.md
Normal file
30
fuzz/README.md
Normal file
|
@ -0,0 +1,30 @@
|
|||
## Fuzzing Tokei
|
||||
|
||||
Tokei can be fuzzed using libFuzzer, via [cargo-fuzz](https://github.com/rust-fuzz/cargo-fuzz/).
|
||||
|
||||
First install cargo-fuzz: `cargo install cargo-fuzz`.
|
||||
|
||||
To launch a fuzzing job: `cargo +nightly fuzz run <target>` - it will run until you kill it with ctrl-c.
|
||||
|
||||
To use multiple cores: `cargo +nightly fuzz run <target> --jobs=6`
|
||||
|
||||
To speed things up (at the expense of missing bugs that only manifest in larger files):
|
||||
`cargo +nightly fuzz run <target> -- -max_len=200`
|
||||
|
||||
Available fuzz targets:
|
||||
|
||||
- `parse_from_slice_panic` - checks that all of the LanguageType instances' `parse_from_slice` function doesn't panic.
|
||||
- `parse_from_slice_total` - checks that the language stats pass a basic test of reporting no more total lines than
|
||||
there are new lines in the file. At the time of writing there are low-hanging bugs here.
|
||||
|
||||
With the two `parse_from_slice` fuzz targets, it makes sense to share a common corpus directory as they have identical
|
||||
input formats, e.g.: `cargo +nightly fuzz run parse_from_slice_{panic,total} fuzz/corpus/common`
|
||||
|
||||
Potential improvements:
|
||||
|
||||
- Build the fuzz harnesses in CI, so they don't rot.
|
||||
- Do some coverage analysis to check if we're missing any code we would benefit from fuzzing (once it's
|
||||
[integrated into cargo-fuzz](https://github.com/rust-fuzz/cargo-fuzz/pull/248))
|
||||
- Tighten the `parse_from_slice_total` fuzz target to check the total lines exactly matches the number of lines in the
|
||||
file. Only once any bugs found with the current fuzzer are fixed.
|
||||
- Check in a minimized corpus, and run regression over it in CI.
|
51
fuzz/fuzz_targets/parse_from_slice.rs
Normal file
51
fuzz/fuzz_targets/parse_from_slice.rs
Normal file
|
@ -0,0 +1,51 @@
|
|||
use arbitrary::Arbitrary;
|
||||
use std::str;
|
||||
|
||||
use tokei::{Config, LanguageType};
|
||||
|
||||
#[derive(Arbitrary, Debug)]
|
||||
pub struct FuzzInput<'a> {
|
||||
lang: LanguageType,
|
||||
treat_doc_strings_as_comments: bool,
|
||||
data: &'a [u8],
|
||||
}
|
||||
|
||||
// The first byte of data is used to select a language; remaining input is parsed
|
||||
// If check_total is true, asserts that the parsed stats pass a basic sanity test
|
||||
pub fn parse_from_slice(input: FuzzInput, check_total: bool) {
|
||||
let config = &Config {
|
||||
treat_doc_strings_as_comments: Some(input.treat_doc_strings_as_comments),
|
||||
|
||||
// these options don't impact the behaviour of parse_from_slice:
|
||||
columns: None,
|
||||
hidden: None,
|
||||
no_ignore: None,
|
||||
no_ignore_parent: None,
|
||||
no_ignore_dot: None,
|
||||
no_ignore_vcs: None,
|
||||
sort: None,
|
||||
types: None,
|
||||
};
|
||||
|
||||
// check that parsing doesn't panic
|
||||
let stats = input.lang.parse_from_slice(input.data, config);
|
||||
|
||||
if check_total {
|
||||
// verify that the parsed total lines is not more than the total occurences of \n and \r\n.
|
||||
// if/when all of the current discrepancies are fixed, we could make this stronger by checking it is equal.
|
||||
if let Ok(s) = str::from_utf8(input.data) {
|
||||
assert!(
|
||||
stats.lines() <= s.lines().count(),
|
||||
"{} got more total lines ({}) than str::lines ({}). Code: {}, Comments: {}, Blanks: {}. treat_doc_strings_as_comments: {}. File contents (as UTF-8):\n{}",
|
||||
input.lang.name(),
|
||||
stats.lines(),
|
||||
s.lines().count(),
|
||||
stats.code,
|
||||
stats.comments,
|
||||
input.treat_doc_strings_as_comments,
|
||||
stats.blanks,
|
||||
s
|
||||
)
|
||||
};
|
||||
}
|
||||
}
|
9
fuzz/fuzz_targets/parse_from_slice_panic.rs
Normal file
9
fuzz/fuzz_targets/parse_from_slice_panic.rs
Normal file
|
@ -0,0 +1,9 @@
|
|||
#![no_main]
|
||||
use libfuzzer_sys::fuzz_target;
|
||||
|
||||
mod parse_from_slice;
|
||||
use parse_from_slice::{parse_from_slice, FuzzInput};
|
||||
|
||||
fuzz_target!(|data: FuzzInput| {
|
||||
parse_from_slice(data, false);
|
||||
});
|
9
fuzz/fuzz_targets/parse_from_slice_total.rs
Normal file
9
fuzz/fuzz_targets/parse_from_slice_total.rs
Normal file
|
@ -0,0 +1,9 @@
|
|||
#![no_main]
|
||||
use libfuzzer_sys::fuzz_target;
|
||||
|
||||
mod parse_from_slice;
|
||||
use parse_from_slice::{parse_from_slice, FuzzInput};
|
||||
|
||||
fuzz_target!(|data: FuzzInput| {
|
||||
parse_from_slice(data, true);
|
||||
});
|
|
@ -1,9 +1,11 @@
|
|||
use arbitrary::Arbitrary;
|
||||
|
||||
/// Represents a individual programming language. Can be used to provide
|
||||
/// information about the language, such as multi line comments, single line
|
||||
/// comments, string literal syntax, whether a given language allows nesting
|
||||
/// comments.
|
||||
#[derive(Deserialize, Serialize)]
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||
#[derive(Arbitrary, Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||
#[non_exhaustive]
|
||||
pub enum LanguageType {
|
||||
{% for key, _ in languages -%}
|
||||
|
|
Loading…
Reference in a new issue