split static and dynamic part of glob, then resolve
RobinMalfait authored and thecrypticace committed Oct 28, 2024
commit eb7e5e69889093fa8b842b34d31d3e6365d23f5b
crates/oxide/src/glob.rs (203 changes: 114 additions & 89 deletions)
@@ -1,24 +1,64 @@
use fxhash::{FxHashMap, FxHashSet};
use glob_match::glob_match;
use std::iter;
use std::path::{Path, PathBuf};
use tracing::event;

use crate::GlobEntry;

pub fn fast_glob(
patterns: &Vec<GlobEntry>,
) -> Result<impl iter::Iterator<Item = PathBuf>, std::io::Error> {
Ok(get_fast_patterns(patterns)
Ok(optimize_patterns(patterns)
.into_iter()
.flat_map(|(base_path, patterns)| {
globwalk::GlobWalkerBuilder::from_patterns(base_path, &patterns)
.follow_links(true)
.build()
.unwrap()
.filter_map(Result::ok)
.map(|file| file.path().to_path_buf())
.flat_map(|glob_entry| {
globwalk::GlobWalkerBuilder::from_patterns(
glob_entry.base,
&[glob_entry.pattern.as_str()][..],
)
.follow_links(true)
.build()
.unwrap()
.filter_map(Result::ok)
.map(|file| file.path().to_path_buf())
}))
}

pub fn hoist_static_glob_parts(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
let mut result = vec![];

for entry in entries {
let (static_part, dynamic_part) = split_pattern(&entry.pattern);

let base: PathBuf = entry.base.clone().into();
let base = match static_part {
Some(static_part) => base.join(static_part),
None => base,
};

let base = match dunce::canonicalize(&base) {
Ok(base) => base,
Err(err) => {
event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err);
// If we can't resolve the new base on disk, let's just skip this entry.
continue;
}
};

let pattern = match dynamic_part {
Some(dynamic_part) => dynamic_part,
None => "**/*".to_owned(),
};

result.push(GlobEntry {
base: base.to_string_lossy().to_string(),
pattern,
});
}

result
}
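
For intuition, here is a standalone sketch of what the hoisting does to a single entry. The `GlobEntry` struct below is a stand-in for `crate::GlobEntry` (only its `base`/`pattern` fields are assumed), and unlike the real function this sketch skips the on-disk canonicalization step:

```rust
// Stand-in for crate::GlobEntry, shown only to illustrate the hoisting.
#[derive(Debug, Clone, PartialEq)]
struct GlobEntry {
    base: String,
    pattern: String,
}

fn main() {
    // Before hoisting: the static `src/templates` prefix is buried in the pattern,
    // so the walker would start at `/project` and visit unrelated directories.
    let before = GlobEntry {
        base: "/project".to_owned(),
        pattern: "src/templates/**/*.html".to_owned(),
    };

    // After hoisting: the static prefix has moved into the base, the remaining
    // pattern is purely dynamic, and the walk starts as deep as possible.
    let after = GlobEntry {
        base: "/project/src/templates".to_owned(),
        pattern: "**/*.html".to_owned(),
    };

    println!("{before:?} -> {after:?}");
}
```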

/// This function attempts to optimize the glob patterns to improve performance. The problem is
/// that if you run the following command:
/// ```sh
@@ -42,98 +82,83 @@ pub fn fast_glob(
/// tailwind --pwd ./project/pages --content "**/*.js"
/// tailwind --pwd ./project/components --content "**/*.js"
/// ```
pub fn get_fast_patterns(patterns: &Vec<GlobEntry>) -> Vec<(PathBuf, Vec<String>)> {
let mut optimized_patterns: Vec<(PathBuf, Vec<String>)> = vec![];
pub fn optimize_patterns(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
let entries = hoist_static_glob_parts(entries);

for pattern in patterns {
let base_path = PathBuf::from(&pattern.base);
let pattern = &pattern.pattern;
// Track all base paths and their patterns. Later we will turn them back into `GlobEntry`s.
let mut pattern_map: FxHashMap<String, FxHashSet<String>> = FxHashMap::default();

let is_negated = pattern.starts_with('!');
let mut pattern = pattern.clone();
if is_negated {
pattern.remove(0);
}
for glob_entry in entries {
let entry = pattern_map.entry(glob_entry.base).or_default();
entry.insert(glob_entry.pattern.clone());
}

let mut folders = pattern.split('/').collect::<Vec<_>>();

if folders.len() <= 1 {
// No paths we can simplify, so let's use it as-is.
optimized_patterns.push((base_path, vec![pattern]));
} else {
// We do have folders because `/` exists. Let's try to simplify the globs!
// Safety: We know that the length is greater than 1, so we can safely unwrap.
let file_pattern = folders.pop().unwrap();
let all_folders = folders.clone();
let mut temp_paths = vec![base_path];

let mut bail = false;

for (i, folder) in folders.into_iter().enumerate() {
// There is a wildcard in the folder, so we have to bail now... 😢 But this also
// means that we can skip looking at the rest of the folders, so there is at least
// this small optimization we can apply!
if folder.contains('*') {
// Get all the remaining folders, attach the existing file_pattern so that this
// can now be the final pattern we use.
let mut remaining_folders = all_folders[i..].to_vec();
remaining_folders.push(file_pattern);

let pattern = remaining_folders.join("/");
for path in &temp_paths {
optimized_patterns.push((path.to_path_buf(), vec![pattern.to_string()]));
}

bail = true;
break;
}
// TODO: Optimization, if any of the patterns result in `**/*`, then we can do two things:
// 1. All base paths in the pattern_map, that start with the current base path, can be removed.
// 2. All patterns that are not `**/*` can be removed from the current base path.

// The folder is very likely using an expandable pattern which we can expand!
if folder.contains('{') && folder.contains('}') {
let branches = expand_braces(folder);

let existing_paths = temp_paths;
temp_paths = branches
.iter()
.flat_map(|branch| {
existing_paths
.clone()
.into_iter()
.map(|path| path.join(branch))
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
}
// The folder should just be a simple folder name without any glob magic. We should
// be able to safely add it to the existing paths.
else {
temp_paths = temp_paths
.into_iter()
.map(|path| path.join(folder))
.collect();
}
pattern_map
.into_iter()
.map(|(base, patterns)| {
let size = patterns.len();
let mut patterns = patterns.into_iter().collect::<Vec<_>>();
patterns.sort();
let combined_patterns = patterns.join(",");

// TODO: Right now this will generate something like `{**/*.html,**/*.js}`, but maybe
// we want to generate this instead:`**/*.{html,js}`.

GlobEntry {
base,
pattern: match size {
1 => combined_patterns,
_ => format!("{{{}}}", combined_patterns),
},
}
})
.collect::<Vec<GlobEntry>>()
}
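
Illustratively, two entries that end up with the same base collapse into a single brace-grouped entry. The sketch below mirrors the grouping step with plain `BTreeMap`/`BTreeSet` instead of the `FxHashMap`/`FxHashSet` used above, and stands alone from the crate:

```rust
use std::collections::{BTreeMap, BTreeSet};

// Minimal sketch of the per-base grouping in `optimize_patterns`: bucket the
// patterns per base, sort them, and wrap them in a brace group when a base has
// more than one pattern.
fn group(entries: Vec<(String, String)>) -> Vec<(String, String)> {
    let mut map: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    for (base, pattern) in entries {
        map.entry(base).or_default().insert(pattern);
    }

    map.into_iter()
        .map(|(base, patterns)| {
            let size = patterns.len();
            let combined = patterns.into_iter().collect::<Vec<_>>().join(",");
            let pattern = match size {
                1 => combined,
                _ => format!("{{{combined}}}"),
            };
            (base, pattern)
        })
        .collect()
}

fn main() {
    let merged = group(vec![
        ("/project/src".to_owned(), "**/*.html".to_owned()),
        ("/project/src".to_owned(), "**/*.js".to_owned()),
    ]);
    // Prints: [("/project/src", "{**/*.html,**/*.js}")]
    println!("{merged:?}");
}
```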

// As long as we didn't bail, we can now add the current expanded patterns to the
// optimized patterns.
if !bail {
for path in &temp_paths {
optimized_patterns.push((path.to_path_buf(), vec![file_pattern.to_string()]));
}
}
// Split a glob pattern into a `static` and `dynamic` part.
//
// Assumption: all globs are already expanded, which means that the only dynamic parts
// use `*`.
//
// E.g.:
// Original input: `../project-b/**/*.{html,js}`
// Expanded input: `../project-b/**/*.html` & `../project-b/**/*.js`
// Split on first input: ("../project-b", "**/*.html")
// Split on second input: ("../project-b", "**/*.js")
fn split_pattern(input: &str) -> (Option<String>, Option<String>) {
// No dynamic parts, so we can just return the input as-is.
if !input.contains('*') {
return (Some(input.to_owned()), None);
}

let mut last_slash_position = None;

for (i, c) in input.char_indices() {
if c == '/' {
last_slash_position = Some(i);
}

// Ensure that we re-add all the `!` signs to the patterns.
if is_negated {
for (_, patterns) in &mut optimized_patterns {
for pattern in patterns {
pattern.insert(0, '!');
}
}
if c == '*' {
break;
}
}

optimized_patterns
// No `/` before the first `*`, therefore there is no static part, only a dynamic part.
let Some(last_slash_position) = last_slash_position else {
return (None, Some(input.to_owned()));
};

let static_part = input[..last_slash_position].to_owned();
let dynamic_part = input[last_slash_position + 1..].to_owned();

let static_part = (!static_part.is_empty()).then_some(static_part);
let dynamic_part = (!dynamic_part.is_empty()).then_some(dynamic_part);

(static_part, dynamic_part)
}
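
As a sanity check, a few hedged test cases for `split_pattern`, matching the examples in the comment above (these are illustrative expectations, not tests that ship with this commit):

```rust
#[cfg(test)]
mod split_pattern_tests {
    use super::*;

    #[test]
    fn splits_a_static_prefix_from_a_dynamic_suffix() {
        assert_eq!(
            split_pattern("../project-b/**/*.html"),
            (Some("../project-b".to_owned()), Some("**/*.html".to_owned()))
        );
    }

    #[test]
    fn keeps_a_fully_static_pattern_in_the_static_part() {
        assert_eq!(
            split_pattern("src/index.html"),
            (Some("src/index.html".to_owned()), None)
        );
    }

    #[test]
    fn has_no_static_part_when_no_slash_precedes_the_first_wildcard() {
        assert_eq!(
            split_pattern("**/*.js"),
            (None, Some("**/*.js".to_owned()))
        );
    }
}
```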

pub fn path_matches_globs(path: &Path, globs: &[GlobEntry]) -> bool {
crates/oxide/src/lib.rs (76 changes: 36 additions & 40 deletions)
@@ -1,10 +1,12 @@
use crate::glob::hoist_static_glob_parts;
use crate::parser::Extractor;
use crate::scanner::allowed_paths::resolve_paths;
use crate::scanner::detect_sources::DetectSources;
use bexpand::Expression;
use bstr::ByteSlice;
use fxhash::{FxHashMap, FxHashSet};
use glob::fast_glob;
use glob::get_fast_patterns;
use glob::optimize_patterns;
use globset::Glob;
use rayon::prelude::*;
use std::fs;
use std::path::PathBuf;
@@ -255,9 +257,6 @@ impl Scanner {
false
});

// Turn `Vec<&GlobEntry>` into `Vec<GlobEntry>`
let glob_sources: Vec<_> = glob_sources.into_iter().cloned().collect();

for path in auto_sources
.iter()
.map(|source| PathBuf::from(&source.base).join(source.pattern.trim_end_matches("**/*")))
@@ -269,46 +268,43 @@
self.globs.extend(globs);
}

let resolved_files: Vec<_> = match fast_glob(&glob_sources) {
Ok(matches) => matches
.filter_map(|x| dunce::canonicalize(&x).ok())
.collect(),
Err(err) => {
event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err);
vec![]
}
};
// Turn `Vec<&GlobEntry>` into `Vec<GlobEntry>`
let glob_sources: Vec<_> = glob_sources.into_iter().cloned().collect();
let hoisted = hoist_static_glob_parts(&glob_sources);

self.files.extend(resolved_files);
self.globs.extend(glob_sources);
for source in &hoisted {
let Ok(glob) = Glob::new(&source.base) else {
continue;
};

// Re-optimize the globs to reduce the number of patterns we have to scan.
self.globs = get_fast_patterns(&self.globs)
.into_iter()
.filter_map(|(root, globs)| {
let root = match dunce::canonicalize(root) {
Ok(root) => root,
Err(error) => {
event!(
tracing::Level::ERROR,
"Failed to canonicalize base path {:?}",
error
);
return None;
}
let glob = glob.compile_matcher();

let base = PathBuf::from(&source.base);
for entry in resolve_paths(&base) {
let Some(file_type) = entry.file_type() else {
continue;
};

Some((root, globs))
})
.flat_map(|(root, globs)| {
let base = root.display().to_string();
if !file_type.is_file() {
continue;
}

globs.into_iter().map(move |glob| GlobEntry {
base: base.clone(),
pattern: glob,
})
})
.collect::<Vec<GlobEntry>>();
let file_path = entry.into_path();

let Some(file_path_str) = file_path.to_str() else {
continue;
};

if glob.is_match(file_path_str) {
self.files.push(file_path);
}
}
}

self.globs.extend(hoisted);

// Re-optimize the globs to reduce the number of patterns we have to scan.
self.globs = optimize_patterns(&self.globs);
}
}
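
A minimal standalone sketch of the matching approach used above, assuming the `globset` and `ignore` crates: compile a glob into a matcher once, walk the tree, and keep only plain files whose paths match. The pattern and root below are hypothetical; the scanner derives them from the hoisted sources and walks via the crate's own `resolve_paths` helper:

```rust
use globset::Glob;
use ignore::WalkBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical pattern and root for illustration only.
    let matcher = Glob::new("**/*.html")?.compile_matcher();

    let mut files = Vec::new();
    for entry in WalkBuilder::new("./src")
        .hidden(false)
        .require_git(false)
        .build()
    {
        let Ok(entry) = entry else { continue };

        // Only plain files are candidates; directories are walked, not matched.
        let Some(file_type) = entry.file_type() else { continue };
        if !file_type.is_file() {
            continue;
        }

        let path = entry.into_path();
        if matcher.is_match(&path) {
            files.push(path);
        }
    }

    println!("{} matching files", files.len());
    Ok(())
}
```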

crates/oxide/src/scanner/allowed_paths.rs (11 changes: 10 additions & 1 deletion)
@@ -30,7 +30,7 @@ pub fn resolve_allowed_paths(root: &Path) -> impl Iterator<Item = DirEntry> {
WalkBuilder::new(root)
.hidden(false)
.require_git(false)
.filter_entry(|entry| match entry.file_type() {
.filter_entry(move |entry| match entry.file_type() {
Some(file_type) if file_type.is_dir() => match entry.file_name().to_str() {
Some(dir) => !IGNORED_CONTENT_DIRS.contains(&dir),
None => false,
@@ -44,6 +44,15 @@ pub fn resolve_allowed_paths(root: &Path) -> impl Iterator<Item = DirEntry> {
.filter_map(Result::ok)
}

#[tracing::instrument(skip(root))]
pub fn resolve_paths(root: &Path) -> impl Iterator<Item = DirEntry> {
WalkBuilder::new(root)
.hidden(false)
.require_git(false)
.build()
.filter_map(Result::ok)
}

pub fn is_allowed_content_path(path: &Path) -> bool {
// Skip known ignored files
if path