pub mod auto_source_detection;
pub mod detect_sources;
pub mod init_tracing;
pub mod sources;
use crate::extractor::{Extracted, Extractor};
use crate::glob::optimize_patterns;
use crate::scanner::detect_sources::resolve_globs;
use crate::scanner::sources::{
public_source_entries_to_private_source_entries, PublicSourceEntry, SourceEntry, Sources,
};
use crate::GlobEntry;
use auto_source_detection::BINARY_EXTENSIONS_GLOB;
use bstr::ByteSlice;
use fast_glob::glob_match;
use fxhash::{FxHashMap, FxHashSet};
use ignore::{gitignore::GitignoreBuilder, WalkBuilder};
use init_tracing::{init_tracing, SHOULD_TRACE};
use rayon::prelude::*;
use std::collections::{BTreeMap, BTreeSet};
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use std::time::SystemTime;
use tracing::event;
/// A piece of content handed to the scanner for candidate extraction.
///
/// In both variants the second field is treated as a file extension: it is the value passed
/// to `pre_process_input` to pick a pre-processor.
#[derive(Debug, Clone)]
pub enum ChangedContent {
/// A file on disk as `(path, extension)`; its bytes are read when the content is processed.
File(PathBuf, String),
/// In-memory content as `(contents, extension)`.
Content(String, String),
}
/// Options describing a scan: an optional base and a list of source globs.
///
/// NOTE(review): this type is not referenced by any other code visible in this file, so how
/// `base` is interpreted (presumably a base directory) must be confirmed against its users.
#[derive(Debug, Clone)]
pub struct ScanOptions {
/// Optional base; presumably the directory that `sources` are relative to — TODO confirm.
pub base: Option<String>,
/// Glob entries to scan.
pub sources: Vec<GlobEntry>,
}
/// Bundle of scan output: candidates, files and globs.
///
/// NOTE(review): nothing visible in this file constructs this type; the field descriptions
/// below mirror the return values of the `Scanner` methods of the same name and should be
/// confirmed against the real producer.
#[derive(Debug, Clone)]
pub struct ScanResult {
/// Extracted candidate strings.
pub candidates: Vec<String>,
/// Paths of the files involved, as strings.
pub files: Vec<String>,
/// Glob entries describing the scanned locations.
pub globs: Vec<GlobEntry>,
}
/// Discovers source files and accumulates the candidates extracted from them.
///
/// A `Scanner` is stateful: candidates, discovered files/directories/extensions and
/// modification times are kept between calls so later scans can report only what is new.
#[derive(Debug, Clone, Default)]
pub struct Scanner {
// Source entries this scanner was created with (after conversion in `Scanner::new`).
sources: Sources,
// Walker built from `sources`; `None` when `create_walker` found no root to walk.
walker: Option<WalkBuilder>,
// Every file extension seen while discovering sources.
extensions: FxHashSet<String>,
// Every file accepted so far, either by the walker or by `scan_content`.
files: FxHashSet<PathBuf>,
// Every directory seen while discovering sources.
dirs: FxHashSet<PathBuf>,
// Cached result of `get_globs`; computed on first use and then reused as-is.
globs: Option<Vec<GlobEntry>>,
// All candidates extracted so far, across every scan.
candidates: FxHashSet<String>,
// Last observed modification time per file; only written on re-scans.
mtimes: FxHashMap<PathBuf, SystemTime>,
// Set after the first completed discovery; enables the parallel walk and mtime checks.
has_scanned_once: bool,
// Guards `discover_sources` so it does its work once until `scan` resets the flag.
sources_scanned: bool,
}
impl Scanner {
pub fn new(sources: Vec<PublicSourceEntry>) -> Self {
init_tracing();
if *SHOULD_TRACE {
event!(tracing::Level::INFO, "Provided sources:");
for source in &sources {
event!(tracing::Level::INFO, "Source: {:?}", source);
}
}
let sources = Sources::new(public_source_entries_to_private_source_entries(sources));
if *SHOULD_TRACE {
event!(tracing::Level::INFO, "Optimized sources:");
for source in sources.iter() {
event!(tracing::Level::INFO, "Source: {:?}", source);
}
}
let walker = create_walker(&sources);
Self {
sources,
walker,
..Default::default()
}
}
/// Walks the sources again and returns *all* candidates known so far, sorted.
///
/// Unlike `scan_content`, the result is the complete accumulated set rather than only the
/// candidates discovered by this call.
pub fn scan(&mut self) -> Vec<String> {
    // Clear the guard so `discover_sources` performs a fresh walk.
    self.sources_scanned = false;

    let (blobs, stylesheets) = self.discover_sources();

    // Only the side effect (updating `self.candidates`) matters here; the list of newly
    // found candidates is discarded.
    let _ = self.extract_candidates(blobs, stylesheets);

    let mut all_candidates: Vec<String> = self.candidates.iter().cloned().collect();
    all_candidates.par_sort_unstable();
    all_candidates
}
/// Extracts candidates from explicitly changed files and in-memory content.
///
/// * In-memory content is always scanned.
/// * Files are canonicalized first; files that cannot be canonicalized are skipped.
/// * Files already known to the scanner are scanned directly.
/// * Unknown files are scanned only if the walker yields them, i.e. if they pass the
///   configured source rules. Such files are then remembered in `self.files`.
///
/// Returns only the candidates that were not known before this call, sorted.
#[tracing::instrument(skip_all)]
pub fn scan_content(&mut self, changed_content: Vec<ChangedContent>) -> Vec<String> {
    let mut content_to_scan: Vec<ChangedContent> = Vec::with_capacity(changed_content.len());
    let mut new_unknown_files: Vec<ChangedContent> = Vec::new();

    // Classify every item in a single pass instead of partitioning repeatedly.
    for changed in changed_content {
        match changed {
            content @ ChangedContent::Content(_, _) => content_to_scan.push(content),
            ChangedContent::File(file, extension) => {
                let Ok(file) = dunce::canonicalize(file) else {
                    continue;
                };
                let is_known = self.files.contains(&file);
                let changed_file = ChangedContent::File(file, extension);
                if is_known {
                    content_to_scan.push(changed_file);
                } else {
                    new_unknown_files.push(changed_file);
                }
            }
        }
    }

    // Decide whether the unknown files may be scanned: a file qualifies when the walker
    // (which applies all source and ignore rules) yields exactly that path.
    if !new_unknown_files.is_empty() {
        if let Some(walk_builder) = &mut self.walker {
            // Borrow the field directly so the closure below does not need `self`.
            let files = &mut self.files;

            for entry in walk_builder.build().filter_map(Result::ok) {
                let path = entry.path();
                if !path.is_file() {
                    continue;
                }

                // Move every pending file that equals this path over to the scan list and
                // drop it from the pending list in one linear pass.
                new_unknown_files.retain(|changed_file| {
                    let ChangedContent::File(file, _) = changed_file else {
                        return true;
                    };
                    if file.as_path() != path {
                        return true;
                    }
                    files.insert(path.to_path_buf());
                    content_to_scan.push(changed_file.clone());
                    false
                });

                // Stop walking as soon as every pending file has been found.
                if new_unknown_files.is_empty() {
                    break;
                }
            }
        }
    }

    let blobs = read_all_files(content_to_scan);
    self.extract_candidates(blobs, vec![])
}
#[tracing::instrument(skip_all)]
fn extract_candidates(&mut self, blobs: Vec<Vec<u8>>, css_files: Vec<PathBuf>) -> Vec<String> {
let mut new_candidates = parse_all_blobs(blobs);
if !css_files.is_empty() {
let css_variables = extract_css_variables(read_all_files(
css_files
.into_iter()
.map(|file| ChangedContent::File(file, "css".into()))
.collect(),
));
new_candidates.extend(css_variables);
}
for existing in self.candidates.iter() {
new_candidates.remove(existing);
}
self.candidates.extend(new_candidates.iter().cloned());
let mut result: Vec<String> = new_candidates.into_iter().collect();
result.par_sort_unstable();
result
}
/// Returns every known file path as a `String`.
///
/// Source discovery is triggered first (a no-op if it already ran since the last `scan`).
/// Paths that are not valid Unicode are left out. The order of the result is unspecified.
#[tracing::instrument(skip_all)]
pub fn get_files(&mut self) -> Vec<String> {
    // Only the bookkeeping done by discovery is needed; the returned blobs are dropped.
    drop(self.discover_sources());

    self.files
        .par_iter()
        .filter_map(|path| path.to_str().map(str::to_owned))
        .collect()
}
/// Returns the glob entries that describe the scanned locations.
///
/// The result is computed once and cached; later calls return a clone of the cached value.
/// `Auto` and `External` sources are resolved against the discovered directories and
/// extensions, `Pattern` sources are passed through, and `Ignored` sources contribute
/// nothing. The combined list is run through `optimize_patterns`.
#[tracing::instrument(skip_all)]
pub fn get_globs(&mut self) -> Vec<GlobEntry> {
    if let Some(cached) = self.globs.as_ref() {
        return cached.clone();
    }

    // Make sure `self.dirs` and `self.extensions` are populated.
    let _ = self.discover_sources();

    let mut collected: Vec<GlobEntry> = Vec::new();
    for source in self.sources.iter() {
        match source {
            SourceEntry::Pattern { base, pattern } => collected.push(GlobEntry {
                base: base.to_string_lossy().into_owned(),
                pattern: pattern.to_string(),
            }),
            SourceEntry::Auto { base } | SourceEntry::External { base } => {
                let resolved = resolve_globs(base.to_path_buf(), &self.dirs, &self.extensions);
                collected.extend(resolved);
            }
            SourceEntry::Ignored { .. } => {}
        }
    }

    let optimized = optimize_patterns(&collected);
    self.globs = Some(optimized.clone());
    optimized
}
/// Returns the configured sources as glob entries, without touching the file system.
///
/// `Auto` and `External` sources become `**/*` under their base, `Pattern` sources keep
/// their own pattern, and `Ignored` sources are omitted.
#[tracing::instrument(skip_all)]
pub fn get_normalized_sources(&self) -> Vec<GlobEntry> {
    let mut normalized: Vec<GlobEntry> = Vec::new();

    for source in self.sources.iter() {
        let (base, pattern) = match source {
            SourceEntry::Auto { base } | SourceEntry::External { base } => {
                (base, "**/*".to_string())
            }
            SourceEntry::Pattern { base, pattern } => (base, pattern.to_string()),
            SourceEntry::Ignored { .. } => continue,
        };

        normalized.push(GlobEntry {
            base: base.to_string_lossy().into_owned(),
            pattern,
        });
    }

    normalized
}
/// Extracts the candidates of a single piece of content together with their byte offsets.
///
/// Offsets are relative to the pre-processed content. Every `-[]` is temporarily replaced
/// by the equally long `XYZ` before extraction; candidates that covered such a spot are
/// reported with their original text. Unreadable content yields an empty list.
#[tracing::instrument(skip_all)]
pub fn get_candidates_with_positions(
    &mut self,
    changed_content: ChangedContent,
) -> Vec<(String, usize)> {
    let original_content = read_changed_content(changed_content).unwrap_or_default();

    // Same-length replacement, so offsets into `patched` are valid in `original_content`.
    let patched = original_content.replace("-[]", "XYZ");
    let base_addr = patched.as_ptr() as usize;

    let mut extractor = Extractor::new(&patched[..]);
    extractor
        .extract()
        .into_par_iter()
        .filter_map(|extracted| {
            let Extracted::Candidate(candidate) = extracted else {
                return None;
            };

            // Candidates are sub-slices of `patched`, so pointer distance is the offset.
            let start = candidate.as_ptr() as usize - base_addr;
            let original_slice = &original_content[start..start + candidate.len()];

            if original_slice.contains_str("-[]") {
                // SAFETY: NOTE(review) — relies on the extractor only yielding spans that
                // are valid UTF-8 in the original content as well; confirm.
                let text = unsafe { String::from_utf8_unchecked(original_slice.to_vec()) };
                return Some((text, start));
            }

            // SAFETY: NOTE(review) — relies on the extractor only yielding valid UTF-8
            // candidate slices; confirm against the extractor's guarantees.
            let text = unsafe { String::from_utf8_unchecked(candidate.to_vec()) };
            Some((text, start))
        })
        .collect()
}
/// Walks the configured sources and returns `(pre-processed blobs, CSS file paths)`.
///
/// Runs at most once until `scan` resets `sources_scanned`; repeated calls (and scanners
/// without a walker) return two empty vectors. The first run walks synchronously and
/// treats every file as changed. Later runs walk in parallel and only return files whose
/// modification time differs from the recorded one (or cannot be read). Directories,
/// files and extensions are recorded on `self` regardless of whether a file changed.
#[tracing::instrument(skip_all)]
fn discover_sources(&mut self) -> (Vec<Vec<u8>>, Vec<PathBuf>) {
    if self.sources_scanned {
        return (Vec::new(), Vec::new());
    }
    self.sources_scanned = true;

    let is_rescan = self.has_scanned_once;

    let Some(walker) = self.walker.as_mut() else {
        return (Vec::new(), Vec::new());
    };
    let entries = if is_rescan {
        walk_parallel(walker)
    } else {
        walk_synchronous(walker)
    };

    let mut css_files: Vec<PathBuf> = Vec::new();
    let mut content_paths: Vec<(PathBuf, String)> = Vec::new();
    let mut seen_files: FxHashSet<PathBuf> = FxHashSet::default();

    for (path, is_dir, extension) in entries {
        if is_dir {
            self.dirs.insert(path);
            continue;
        }

        // The walk may report the same file more than once; handle each file one time.
        if !seen_files.insert(path.clone()) {
            continue;
        }

        // The first scan skips the metadata lookup entirely and treats all files as
        // changed. On re-scans a file is changed when its mtime is new or different, or
        // when the mtime cannot be determined.
        let changed = !is_rescan || {
            let modified = std::fs::metadata(&path).and_then(|meta| meta.modified());
            match modified {
                Ok(mtime) => self.mtimes.insert(path.clone(), mtime) != Some(mtime),
                Err(_) => true,
            }
        };

        if changed {
            // CSS files are kept apart from regular content files.
            if extension == "css" {
                css_files.push(path.clone());
            } else {
                content_paths.push((path.clone(), extension.clone()));
            }
        }

        self.extensions.insert(extension);
        self.files.insert(path);
    }

    // Read and pre-process the changed content files in parallel; unreadable files and
    // files that end up empty are dropped.
    let scanned_blobs: Vec<Vec<u8>> = content_paths
        .into_par_iter()
        .filter_map(|(path, ext)| {
            let raw = std::fs::read(&path).ok()?;
            event!(tracing::Level::INFO, "Reading {:?}", path);
            Some(pre_process_input(raw, &ext)).filter(|processed| !processed.is_empty())
        })
        .collect();

    self.has_scanned_once = true;

    (scanned_blobs, css_files)
}
}
/// Loads the bytes of `c` and runs them through `pre_process_input`.
///
/// Returns `None` (after logging an error event) when a file cannot be read. In-memory
/// content cannot fail.
fn read_changed_content(c: ChangedContent) -> Option<Vec<u8>> {
    let (bytes, extension) = match c {
        ChangedContent::Content(contents, extension) => (contents.into_bytes(), extension),
        ChangedContent::File(file, extension) => {
            let bytes = match std::fs::read(&file) {
                Ok(bytes) => bytes,
                Err(e) => {
                    event!(tracing::Level::ERROR, "Failed to read file: {:?}", e);
                    return None;
                }
            };
            event!(tracing::Level::INFO, "Reading {:?}", file);
            (bytes, extension)
        }
    };

    Some(pre_process_input(bytes, &extension))
}
/// Runs the pre-processor registered for `extension` over `content`.
///
/// The extension is matched exactly (no leading dot, case-sensitive). Content with an
/// extension that has no pre-processor is returned unchanged.
pub fn pre_process_input(content: Vec<u8>, extension: &str) -> Vec<u8> {
    use crate::extractor::pre_processors::*;

    match extension {
        // Template languages
        "haml" => Haml.process(&content),
        "pug" => Pug.process(&content),
        "slim" | "slang" => Slim.process(&content),
        "cshtml" | "razor" => Razor.process(&content),
        "heex" | "eex" | "ex" | "exs" => Elixir.process(&content),
        "rb" | "erb" => Ruby.process(&content),

        // Component formats
        "svelte" => Svelte.process(&content),
        "vue" => Vue.process(&content),

        // Data and documentation formats
        "json" | "jsonl" | "ndjson" => Json.process(&content),
        "md" | "mdx" => Markdown.process(&content),

        // Programming languages
        "clj" | "cljs" | "cljc" => Clojure.process(&content),
        "rs" => Rust.process(&content),

        // Everything else is scanned as-is.
        _ => content,
    }
}
/// Reads and pre-processes all given content in parallel.
///
/// Items that cannot be read are left out of the result.
#[tracing::instrument(skip_all)]
fn read_all_files(changed_content: Vec<ChangedContent>) -> Vec<Vec<u8>> {
    let total = changed_content.len();
    event!(tracing::Level::INFO, "Reading {:?} file(s)", total);

    let blobs: Vec<Vec<u8>> = changed_content
        .into_par_iter()
        .filter_map(read_changed_content)
        .collect();
    blobs
}
#[tracing::instrument(skip_all)]
fn extract_css_variables(blobs: Vec<Vec<u8>>) -> FxHashSet<String> {
extract(blobs, |mut extractor| {
extractor.extract_variables_from_css()
})
}
#[tracing::instrument(skip_all)]
fn parse_all_blobs(blobs: Vec<Vec<u8>>) -> FxHashSet<String> {
extract(blobs, |mut extractor| extractor.extract())
}
#[tracing::instrument(skip_all)]
fn extract<H>(blobs: Vec<Vec<u8>>, handle: H) -> FxHashSet<String>
where
H: Fn(Extractor) -> Vec<Extracted> + std::marker::Sync,
{
blobs
.par_iter()
.flat_map(|blob| blob.par_split(|x| *x == b'\n'))
.filter_map(|blob| {
if blob.is_empty() {
return None;
}
let extracted = handle(crate::extractor::Extractor::new(blob));
if extracted.is_empty() {
return None;
}
Some(FxHashSet::from_iter(extracted.into_iter().map(
|x| match x {
Extracted::Candidate(bytes) => bytes,
Extracted::CssVariable(bytes) => bytes,
},
)))
})
.reduce(Default::default, |mut a, b| {
a.extend(b);
a
})
.into_iter()
.map(|s| unsafe { String::from_utf8_unchecked(s.to_vec()) })
.collect()
}
/// One entry produced by a file-system walk: `(path, is_dir, extension)`.
///
/// The extension is the empty string for directories and for files whose extension is
/// missing or not valid UTF-8.
type WalkEntry = (PathBuf, bool, String);
/// Walks all roots of `walker` on the current thread and returns the entries in walk
/// order. Entries the walker reports as errors are skipped.
#[tracing::instrument(skip_all)]
fn walk_synchronous(walker: &mut WalkBuilder) -> Vec<WalkEntry> {
    walker
        .build()
        .filter_map(Result::ok)
        .map(|entry| {
            let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
            let path = entry.into_path();

            // Directories never carry an extension in the result.
            let extension = if is_dir {
                String::new()
            } else {
                path.extension()
                    .and_then(|ext| ext.to_str())
                    .unwrap_or_default()
                    .to_owned()
            };

            (path, is_dir, extension)
        })
        .collect()
}
/// Walks all roots of `walker` with the parallel walker and returns every entry.
///
/// Each per-thread visitor buffers entries locally and moves them into a shared,
/// mutex-protected vector in batches; whatever is left in a buffer is moved over when the
/// visitor is dropped. The order of the returned entries is therefore unspecified.
/// Entries the walker reports as errors are skipped.
///
/// # Panics
///
/// Panics if the shared mutex is poisoned, or if a visitor still holds a reference to the
/// shared vector after the walk has finished.
#[tracing::instrument(skip_all)]
fn walk_parallel(walker: &mut WalkBuilder) -> Vec<WalkEntry> {
    /// Number of locally buffered entries that triggers a flush.
    const BATCH_SIZE: usize = 256;

    /// Per-visitor buffer that hands its remaining entries over when dropped.
    struct FlushOnDrop {
        local: Vec<WalkEntry>,
        shared: Arc<Mutex<Vec<WalkEntry>>>,
    }

    impl FlushOnDrop {
        /// Moves all locally buffered entries into the shared vector.
        fn flush(&mut self) {
            self.shared.lock().unwrap().append(&mut self.local);
        }
    }

    impl Drop for FlushOnDrop {
        fn drop(&mut self) {
            if !self.local.is_empty() {
                self.flush();
            }
        }
    }

    let collected: Arc<Mutex<Vec<WalkEntry>>> = Arc::new(Mutex::new(Vec::new()));

    walker.build_parallel().run(|| {
        let mut buffer = FlushOnDrop {
            local: Vec::with_capacity(BATCH_SIZE),
            shared: Arc::clone(&collected),
        };

        Box::new(move |entry| {
            if let Ok(entry) = entry {
                let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
                let path = entry.into_path();
                let extension = if is_dir {
                    String::new()
                } else {
                    path.extension()
                        .and_then(|ext| ext.to_str())
                        .unwrap_or_default()
                        .to_owned()
                };
                buffer.local.push((path, is_dir, extension));

                if buffer.local.len() >= BATCH_SIZE {
                    buffer.flush();
                }
            }

            ignore::WalkState::Continue
        })
    });

    // All visitors are expected to be gone by now, leaving this as the only reference.
    Arc::try_unwrap(collected).unwrap().into_inner().unwrap()
}
/// Builds the `WalkBuilder` used to discover files for `sources`.
///
/// Returns `None` when there is no `Auto`, `Pattern` or `External` source, because only
/// those variants provide a root directory to walk.
///
/// # Panics
///
/// Panics if a collected ignore line is rejected by `GitignoreBuilder::add_line`, or if
/// `GitignoreBuilder::build` fails (both results are unwrapped).
fn create_walker(sources: &Sources) -> Option<WalkBuilder> {
// The first root seeds `WalkBuilder::new`; every later root is attached via `builder.add`.
// NOTE(review): a later source whose base equals the first root is still inserted into
// `other_roots` — presumably `Sources` avoids that; confirm.
let mut other_roots: FxHashSet<&PathBuf> = FxHashSet::default();
let mut first_root: Option<&PathBuf> = None;
// Gitignore-style lines grouped per base. The B-tree containers make the iteration order
// (bases, and lines within a base) sorted and therefore deterministic.
let mut ignores: BTreeMap<&PathBuf, BTreeSet<String>> = Default::default();
for source in sources.iter() {
match source {
// Auto sources only contribute a root.
SourceEntry::Auto { base } => {
if first_root.is_none() {
first_root = Some(base);
} else {
other_roots.insert(base);
}
}
// Pattern sources contribute a root plus a negated (`!`) line for their base.
SourceEntry::Pattern { base, pattern } => {
let mut pattern = pattern.to_string();
if first_root.is_none() {
first_root = Some(base);
} else {
other_roots.insert(base);
}
if !pattern.contains("**") {
// Patterns without `**` are anchored with a leading `/` and negated as a whole.
if !pattern.starts_with("/") {
pattern = format!("/{pattern}");
}
ignores
.entry(base)
.or_default()
.insert(format!("!{}", pattern));
} else {
// Patterns with `**` only negate their file extension, if they have one.
if let Some(extension) = Path::new(&pattern).extension() {
ignores
.entry(base)
.or_default()
.insert(format!("!*.{}", extension.to_string_lossy()));
}
}
}
// Ignored sources contribute no root, only an anchored (non-negated) line.
SourceEntry::Ignored { base, pattern } => {
let mut pattern = pattern.to_string();
if !pattern.starts_with("/") {
pattern = format!("/{pattern}");
}
ignores.entry(base).or_default().insert(pattern);
}
// External sources contribute a root, a negated `/**/*` line and the binary-extension
// glob for their base.
SourceEntry::External { base } => {
if first_root.is_none() {
first_root = Some(base);
} else {
other_roots.insert(base);
}
ignores
.entry(base)
.or_default()
.insert(format!("!{}", "/**/*"));
ignores
.entry(base)
.or_default()
.insert(BINARY_EXTENSIONS_GLOB.clone());
}
}
}
// Without any root there is nothing to walk: `?` returns `None` here.
let mut builder = WalkBuilder::new(first_root?);
builder.follow_links(true);
builder.hidden(false);
builder.git_global(false);
builder.require_git(false);
// Turn `require_git` back on when the first root or one of its ancestors contains `.git`.
for parent in first_root?.ancestors() {
if parent.join(".git").exists() {
builder.require_git(true);
break;
}
}
for root in other_roots {
builder.add(root);
}
// Built-in rules are registered before the per-base rules derived from the sources.
// NOTE(review): registration order presumably determines precedence — confirm against
// the `add_gitignore` implementation in use.
for ignore in auto_source_detection::RULES.iter() {
builder.add_gitignore(ignore.clone());
}
for (base, patterns) in ignores {
let mut ignore_builder = GitignoreBuilder::new(base);
for pattern in patterns {
// NOTE(review): panics on a line the builder rejects; the lines derive from source
// patterns, so confirm those are validated earlier.
ignore_builder.add_line(None, &pattern).unwrap();
}
let ignore = ignore_builder.build().unwrap();
builder.add_gitignore(ignore);
}
// Owned copies of the bases/patterns, because the filter closure below is `move`.
let auto_bases: Vec<PathBuf> = sources
.iter()
.filter_map(|source| match source {
SourceEntry::Auto { base } | SourceEntry::External { base } => Some(base.clone()),
_ => None,
})
.collect();
// Pattern sources with their pattern normalized to start with `/`.
let pattern_sources: Vec<(PathBuf, String)> = sources
.iter()
.filter_map(|source| match source {
SourceEntry::Pattern { base, pattern } => {
let normalized = if pattern.starts_with("/") {
pattern.to_string()
} else {
format!("/{pattern}")
};
Some((base.clone(), normalized))
}
_ => None,
})
.collect();
// Entry filter: anything that is not a file always passes. A file passes only when it
// lies under an Auto/External base, or when its path relative to a Pattern base (with a
// leading `/`) matches that source's glob.
builder.filter_entry(move |entry| {
let path = entry.path();
if path.is_file() {
let mut matches = false;
for base in &auto_bases {
if path.starts_with(base) {
matches = true;
break;
}
}
if !matches {
for (base, pattern) in &pattern_sources {
// `strip_prefix` fails for files outside `base`, which counts as no match.
let remainder = path.strip_prefix(base);
if remainder.is_ok_and(|remainder| {
let mut path_str = remainder.to_string_lossy().to_string();
if !path_str.starts_with("/") {
path_str = format!("/{path_str}");
}
glob_match(pattern, path_str.as_bytes())
}) {
matches = true;
break;
}
}
}
if !matches {
return false;
}
}
true
});
Some(builder)
}
#[cfg(test)]
mod tests {
use super::{ChangedContent, Scanner};
use pretty_assertions::assert_eq;
// Verifies that `get_candidates_with_positions` returns every candidate, in input order,
// paired with its byte offset in the input.
#[test]
fn test_positions() {
// No sources are required: the content is passed in directly.
let mut scanner = Scanner::new(vec![]);
for (input, expected) in [
// Case 1: includes `group-[]:tw__flex`, which must be reported with its original
// `-[]` text.
(
r#"<div class="!tw__flex sm:!tw__block tw__bg-gradient-to-t flex tw:[color:red] group-[]:tw__flex"#,
vec![
("class".to_string(), 5),
("!tw__flex".to_string(), 12),
("sm:!tw__block".to_string(), 22),
("tw__bg-gradient-to-t".to_string(), 36),
("flex".to_string(), 57),
("tw:[color:red]".to_string(), 62),
("group-[]:tw__flex".to_string(), 77),
],
),
// Case 2: prefixed (`tw:`) candidates, including one with an escaped `\:` inside an
// arbitrary variant.
(
r#"<div class="tw:flex! tw:sm:block! tw:bg-linear-to-t flex tw:[color:red] tw:in-[.tw\:group]:flex"></div>"#,
vec![
("class".to_string(), 5),
("tw:flex!".to_string(), 12),
("tw:sm:block!".to_string(), 21),
("tw:bg-linear-to-t".to_string(), 34),
("flex".to_string(), 52),
("tw:[color:red]".to_string(), 57),
("tw:in-[.tw\\:group]:flex".to_string(), 72),
],
),
] {
// The content is scanned as in-memory content with an `html` extension.
let candidates = scanner.get_candidates_with_positions(ChangedContent::Content(
input.to_string(),
"html".into(),
));
assert_eq!(candidates, expected);
}
}
}