refactor SyntaxCounter to be concurrent, and share language information.

This commit is contained in:
Erin Power 2020-03-20 18:22:48 +01:00
parent 2e12cf6faa
commit bdd1431a2c
8 changed files with 275 additions and 105 deletions

9
Cargo.lock generated
View file

@@ -258,6 +258,7 @@ dependencies = [
"ahash",
"cfg-if",
"num_cpus",
"serde",
]
[[package]]
@@ -901,6 +902,9 @@ name = "serde"
version = "1.0.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e707fbbf255b8fc8c3b99abb91e7257a622caeb20a9818cbadbeeede4e0932ff"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_cbor"
@@ -914,9 +918,9 @@ dependencies = [
[[package]]
name = "serde_derive"
version = "1.0.104"
version = "1.0.105"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "128f9e303a5a29922045a830221b8f78ec74a5f544944f3d5984f8ec3895ef64"
checksum = "ac5d00fc561ba2724df6758a17de23df5914f20e41cb00f94d5b7ae42fffaff8"
dependencies = [
"proc-macro2",
"quote",
@@ -1056,7 +1060,6 @@ dependencies = [
"regex",
"serde",
"serde_cbor",
"serde_derive",
"serde_json",
"serde_yaml",
"tempfile",

View file

@@ -45,12 +45,12 @@ grep-searcher = "0.1.7"
ignore = "0.4.12"
log = "0.4.6"
rayon = "1.3.0"
serde = "1.0.105"
serde_derive = "1.0.104"
serde = { version = "1.0.105", features = ["derive", "rc"] }
term_size = "0.3.1"
toml = "0.5.6"
parking_lot = "0.10.0"
dashmap = "3.7.0"
dashmap = { version = "3.7.0", features = ["serde"] }
lazy_static = "1.4.0"
[dependencies.env_logger]
features = []

View file

@@ -45,6 +45,6 @@ if [ $FULL = true ]; then
"loc $input" \
"cloc $input"
else
hyperfine -w 5 "target/release/tokei $input" \
hyperfine -w 10 -m 50 "target/release/tokei $input" \
"tokei $input"
fi

View file

@@ -195,6 +195,162 @@ impl LanguageType {
}
}
pub(crate) fn start_any_comments(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{~@key}} =>
&[
{{#if this.line_comment}}
{{~#each this.line_comment}}
"{{this}}",
{{~/each}}
{{~/if}}
{{#if this.multi_line_comments}}
{{~#each this.multi_line_comments}}
"{{this.0}}",
{{~/each}}
{{~/if}}
{{#if this.nested_comments}}
{{~#each this.nested_comments}}
"{{this.0}}",
{{~/each}}
{{~/if}}
],
{{~/each}}
}
}
pub(crate) fn _start_string_literals(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.quotes}}
{{~@key}} =>
&[
{{~#each this.quotes}}
"{{this.0}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
pub(crate) fn _end_string_literals(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.quotes}}
{{~@key}} =>
&[
{{~#each this.quotes}}
"{{this.1}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
pub(crate) fn _start_multi_line_comments(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.multi_line_comments}}
{{~@key}} =>
&[
{{~#each this.multi_line_comments}}
"{{this.0}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
pub(crate) fn _end_multi_line_comments(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.multi_line_comments}}
{{~@key}} =>
&[
{{~#each this.multi_line_comments}}
"{{this.1}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
pub(crate) fn _start_nested_comments(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.nested_comments}}
{{~@key}} =>
&[
{{~#each this.nested_comments}}
"{{this.0}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
pub(crate) fn _end_nested_comments(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.nested_comments}}
{{~@key}} =>
&[
{{~#each this.nested_comments}}
"{{this.1}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
/// Returns the starting doc literal.
pub(crate) fn _start_doc_quotes(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.doc_quotes}}
{{~@key}} =>
&[
{{~#each this.doc_quotes}}
"{{this.0}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
/// Returns the starting doc literal.
pub(crate) fn _end_doc_quotes(self) -> &'static [&'static str] {
match self {
{{#each languages}}
{{#if this.doc_quotes}}
{{~@key}} =>
&[
{{~#each this.doc_quotes}}
"{{this.1}}",
{{~/each}}
],
{{~/if}}
{{~/each}}
_ => &[],
}
}
/// Returns the parts of syntax that determines whether tokei can skip large
/// parts of analysis.
pub fn important_syntax(self) -> &'static [&'static str] {

View file

@@ -5,7 +5,6 @@ use std::{
io::{self, BufRead, BufReader, Read},
path::{Path, PathBuf},
str::FromStr,
sync::Arc,
};
use crate::{
@@ -15,7 +14,6 @@ use crate::{
utils::{ext::SliceExt, fs as fsutils},
};
use aho_corasick::AhoCorasick;
use encoding_rs_io::DecodeReaderBytesBuilder;
use grep_searcher::LineIter;
@@ -26,12 +24,7 @@ include!(concat!(env!("OUT_DIR"), "/language_type.rs"));
impl LanguageType {
/// Parses a given `Path` using the `LanguageType`. Returning `Stats`
/// on success and giving back ownership of PathBuf on error.
pub fn parse(
self,
path: PathBuf,
config: &Config,
matchers: Arc<(AhoCorasick, AhoCorasick)>,
) -> Result<Stats, (io::Error, PathBuf)> {
pub fn parse(self, path: PathBuf, config: &Config) -> Result<Stats, (io::Error, PathBuf)> {
let text = {
let f = match File::open(&path) {
Ok(f) => f,
@@ -46,18 +39,12 @@ impl LanguageType {
s
};
Ok(self.parse_from_slice(path, &text, config, matchers))
Ok(self.parse_from_slice(path, &text, config))
}
/// Parses the text provided. Returns `Stats` on success.
pub fn parse_from_str<A: AsRef<str>>(
self,
path: PathBuf,
text: A,
config: &Config,
matchers: Arc<(AhoCorasick, AhoCorasick)>,
) -> Stats {
self.parse_from_slice(path, text.as_ref().as_bytes(), config, matchers)
pub fn parse_from_str<A: AsRef<str>>(self, path: PathBuf, text: A, config: &Config) -> Stats {
self.parse_from_slice(path, text.as_ref().as_bytes(), config)
}
/// Parses the text provided. Returning `Stats` on success.
@@ -66,7 +53,6 @@ impl LanguageType {
path: PathBuf,
text: A,
config: &Config,
matchers: Arc<(AhoCorasick, AhoCorasick)>,
) -> Stats {
let lines = LineIter::new(b'\n', text.as_ref());
let mut stats = Stats::new(path);
@@ -77,46 +63,42 @@ impl LanguageType {
stats.code = count;
stats
} else {
self.parse_lines(config, lines, stats, matchers)
self.parse_lines(config, lines, stats)
}
}
pub(crate) fn create_matchers(self) -> (AhoCorasick, AhoCorasick) {
(
aho_corasick::AhoCorasick::new_auto_configured(self.important_syntax()),
aho_corasick::AhoCorasick::new_auto_configured(self.line_comments()),
)
}
#[inline]
fn parse_lines<'a>(
self,
config: &Config,
lines: impl IntoIterator<Item = &'a [u8]>,
mut stats: Stats,
matchers: Arc<(AhoCorasick, AhoCorasick)>,
) -> Stats {
let mut syntax = SyntaxCounter::new(self);
let (ref matcher, ref single_comment) = *matchers;
for line in lines {
// FORTRAN has a rule where it only counts as a comment if it's the
// first character in the column, so removing starting whitespace
// could cause a miscount.
let line = if syntax.is_fortran { line } else { line.trim() };
let line = if syntax.shared.is_fortran {
line
} else {
line.trim()
};
trace!("{}", String::from_utf8_lossy(line));
if line.trim().is_empty() {
stats.blanks += 1;
trace!("Blank No.{}", stats.blanks);
continue;
} else if syntax.is_plain_mode() && !matcher.is_match(line) {
} else if syntax.is_plain_mode() && !syntax.shared.important_syntax.is_match(line) {
trace!("^ Skippable");
if single_comment
.earliest_find(line)
.map(|m| m.start() == 0)
.unwrap_or(false)
if syntax
.shared
.line_comments
.iter()
.any(|c| line.starts_with(c.as_bytes()))
{
stats.comments += 1;
trace!("Comment No.{}", stats.comments);
@@ -180,15 +162,17 @@ impl LanguageType {
// If we're currently in a comment or we just ended
// with one.
syntax
.start_of_comments()
.any(|comment| line.starts_with(comment.as_bytes()))
.shared
.any_comments
.earliest_find(line)
.map_or(false, |e| e.start() == 0)
&& syntax.quote.is_none()
)
|| ((
// If we're currently in a doc string or we just ended
// with one.
syntax.quote.is_some() ||
syntax.doc_quotes.iter().any(|(s, _)| line.starts_with(s.as_bytes()))
syntax.shared.doc_quotes.iter().any(|(s, _)| line.starts_with(s.as_bytes()))
) &&
// `Some(true)` is import in order to respect the current
// configuration.

View file

@@ -1,3 +1,7 @@
use std::sync::Arc;
use aho_corasick::AhoCorasick;
use dashmap::DashMap;
use log::Level::Trace;
use super::language_type::LanguageType;
@@ -15,53 +19,77 @@ use super::language_type::LanguageType;
/// `comment` mode.
#[derive(Clone, Debug)]
pub(crate) struct SyntaxCounter {
pub(crate) allows_nested: bool,
pub(crate) doc_quotes: &'static [(&'static str, &'static str)],
pub(crate) is_fortran: bool,
pub(crate) line_comments: &'static [&'static str],
pub(crate) multi_line_comments: &'static [(&'static str, &'static str)],
pub(crate) nested_comments: &'static [(&'static str, &'static str)],
pub(crate) shared: Arc<SharedMatchers>,
pub(crate) quote: Option<&'static str>,
pub(crate) quote_is_doc_quote: bool,
pub(crate) quotes: &'static [(&'static str, &'static str)],
pub(crate) stack: Vec<&'static str>,
}
#[derive(Clone, Debug)]
pub(crate) struct SharedMatchers {
pub allows_nested: bool,
pub doc_quotes: &'static [(&'static str, &'static str)],
pub important_syntax: AhoCorasick,
pub any_comments: AhoCorasick,
pub is_fortran: bool,
pub line_comments: &'static [&'static str],
pub multi_line_comments: &'static [(&'static str, &'static str)],
pub nested_comments: &'static [(&'static str, &'static str)],
pub string_literals: &'static [(&'static str, &'static str)],
}
impl SharedMatchers {
pub fn new(language: LanguageType) -> Arc<Self> {
lazy_static::lazy_static! {
pub(crate) static ref MATCHERS: DashMap<LanguageType, Arc<SharedMatchers>> = DashMap::new();
}
MATCHERS
.entry(language)
.or_insert_with(|| Arc::new(Self::init(language)))
.value()
.clone()
}
pub fn init(language: LanguageType) -> Self {
Self {
allows_nested: language.allows_nested(),
doc_quotes: language.doc_quotes(),
is_fortran: language.is_fortran(),
important_syntax: AhoCorasick::new_auto_configured(language.important_syntax()),
any_comments: AhoCorasick::new_auto_configured(language.start_any_comments()),
line_comments: language.line_comments(),
multi_line_comments: language.multi_line_comments(),
nested_comments: language.nested_comments(),
string_literals: language.quotes(),
}
}
}
impl SyntaxCounter {
pub(crate) fn new(language: LanguageType) -> Self {
Self {
allows_nested: language.allows_nested(),
doc_quotes: language.doc_quotes(),
is_fortran: language.is_fortran(),
line_comments: language.line_comments(),
multi_line_comments: language.multi_line_comments(),
nested_comments: language.nested_comments(),
shared: SharedMatchers::new(language),
quote_is_doc_quote: false,
quotes: language.quotes(),
stack: Vec::with_capacity(1),
quote: None,
}
}
#[inline]
pub(crate) fn start_of_comments(&self) -> impl Iterator<Item = &&str> {
self.line_comments
.iter()
.chain(self.multi_line_comments.iter().map(|(s, _)| s))
.chain(self.nested_comments.iter().map(|(s, _)| s))
}
#[inline]
pub(crate) fn parse_line_comment(&self, window: &[u8]) -> bool {
if self.quote.is_some() || !self.stack.is_empty() {
return false;
}
for comment in self.line_comments {
if window.starts_with(comment.as_bytes()) {
trace!("Start {:?}", comment);
return true;
}
if let Some(comment) = self
.shared
.line_comments
.iter()
.find(|c| window.starts_with(c.as_bytes()))
{
trace!("Start {:?}", comment);
return true;
}
false
@@ -73,22 +101,28 @@ impl SyntaxCounter {
return None;
}
for &(start, end) in self.doc_quotes {
if window.starts_with(start.as_bytes()) {
trace!("Start Doc {:?}", start);
self.quote = Some(end);
self.quote_is_doc_quote = true;
return Some(start.len());
}
if let Some((start, end)) = self
.shared
.doc_quotes
.iter()
.find(|(s, _)| window.starts_with(s.as_bytes()))
{
trace!("Start Doc {:?}", start);
self.quote = Some(end);
self.quote_is_doc_quote = true;
return Some(start.len());
}
for &(start, end) in self.quotes {
if window.starts_with(start.as_bytes()) {
trace!("Start {:?}", start);
self.quote = Some(end);
self.quote_is_doc_quote = false;
return Some(start.len());
}
if let Some((start, end)) = self
.shared
.string_literals
.iter()
.find(|(s, _)| window.starts_with(s.as_bytes()))
{
trace!("Start {:?}", start);
self.quote = Some(end);
self.quote_is_doc_quote = false;
return Some(start.len());
}
None
@@ -115,16 +149,20 @@ impl SyntaxCounter {
return None;
}
let iter = self.multi_line_comments.iter().chain(self.nested_comments);
let iter = self
.shared
.multi_line_comments
.iter()
.chain(self.shared.nested_comments);
for &(start, end) in iter {
if window.starts_with(start.as_bytes()) {
if self.stack.is_empty()
|| self.allows_nested
|| self.nested_comments.contains(&(start, end))
|| self.shared.allows_nested
|| self.shared.nested_comments.contains(&(start, end))
{
self.stack.push(end);
if log_enabled!(Trace) && self.allows_nested {
if log_enabled!(Trace) && self.shared.allows_nested {
trace!("Start nested {:?}", start);
} else {
trace!("Start {:?}", start);

View file

@@ -45,7 +45,7 @@
#[macro_use]
extern crate log;
#[macro_use]
extern crate serde_derive;
extern crate serde;
#[macro_use]
mod utils;

View file

@@ -1,4 +1,4 @@
use std::{collections::BTreeMap, path::Path, sync::Arc};
use std::{collections::BTreeMap, path::Path};
use ignore::{overrides::OverrideBuilder, WalkBuilder, WalkState::Continue};
@@ -78,10 +78,8 @@ pub fn get_all_files<A: AsRef<Path>>(
}
};
if let Some(file_type) = entry.file_type() {
if file_type.is_file() {
tx.send(entry).unwrap();
}
if entry.file_type().map_or(false, |ft| ft.is_file()) {
tx.send(entry).unwrap();
}
Continue
@@ -89,10 +87,6 @@ pub fn get_all_files<A: AsRef<Path>>(
});
let types: Option<&[LanguageType]> = config.types.as_ref().map(|v| &**v);
let matchers = dashmap::DashMap::<
LanguageType,
Arc<(aho_corasick::AhoCorasick, aho_corasick::AhoCorasick)>,
>::new();
let iter = rx
.into_iter()
@@ -100,13 +94,8 @@ pub fn get_all_files<A: AsRef<Path>>(
.filter_map(|e| LanguageType::from_path(e.path(), &config).map(|l| (e, l)))
.filter(|(_, l)| types.map(|t| t.contains(l)).unwrap_or(true))
.map(|(entry, language)| {
let matchers = matchers
.entry(language)
.or_insert_with(|| Arc::new(language.create_matchers()))
.clone();
language
.parse(entry.into_path(), &config, matchers)
.parse(entry.into_path(), &config)
.map(|stats| (language, Some(stats)))
.unwrap_or_else(|(e, path)| {
error!("Error reading {}:\n{}", path.display(), e);