197 lines
5.4 KiB
Rust
197 lines
5.4 KiB
Rust
use crate::stats::DiscoveryStats;
|
|
use crate::types::FileRecord;
|
|
use anyhow::Result;
|
|
use ignore::WalkBuilder;
|
|
use std::path::Path;
|
|
use std::time::{Instant, UNIX_EPOCH};
|
|
|
|
// Step 1: Discovery - find all files respecting ignore patterns.

/// Built-in ignore patterns that `should_ignore` checks for every file the
/// walker yields, on top of whatever the walker's own filters exclude.
///
/// Entries ending in `/**` name directories, entries starting with `*` are
/// file-name suffix patterns, and everything else is a literal name.
const DEFAULT_IGNORES: &[&str] = &[
    // Directories
    ".git/**",
    "node_modules/**",
    "dist/**",
    "build/**",
    "target/**",
    // Lock files
    "**/*.lock",
    "*-lock.json",
    "*.lock",
    // Editor / IDE directories
    ".vscode/**",
    ".idea/**",
    // Python byte-code
    "__pycache__/**",
    "*.pyc",
    // macOS folder metadata
    ".DS_Store",
];
|
|
|
|
/// Largest file size, in bytes, that discovery will still index (2 MB, decimal).
/// Files strictly larger than this are counted as skipped.
const MAX_INDEXABLE_BYTES: u64 = 2 * 1_000_000;
|
|
|
|
pub fn discover<P: AsRef<Path>>(
|
|
root: P,
|
|
verbose: bool,
|
|
) -> Result<(Vec<FileRecord>, DiscoveryStats)> {
|
|
let start = Instant::now();
|
|
let root = root.as_ref();
|
|
|
|
if verbose {
|
|
println!("[Discovery] Scanning directory: {}", root.display());
|
|
}
|
|
|
|
let mut files = Vec::new();
|
|
let mut skipped = 0;
|
|
let mut total_bytes = 0u64;
|
|
|
|
let walker = WalkBuilder::new(root)
|
|
.standard_filters(true) // Respects .gitignore, .ignore, etc.
|
|
.hidden(false) // Don't skip hidden files by default
|
|
.build();
|
|
|
|
for entry_result in walker {
|
|
let entry = match entry_result {
|
|
Ok(e) => e,
|
|
Err(e) => {
|
|
eprintln!("Error walking directory: {}", e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
// Skip directories
|
|
if entry.file_type().map_or(true, |ft| ft.is_dir()) {
|
|
continue;
|
|
}
|
|
|
|
let path = entry.path();
|
|
|
|
// Check against default ignores
|
|
if should_ignore(path) {
|
|
skipped += 1;
|
|
continue;
|
|
}
|
|
|
|
let metadata = match std::fs::metadata(path) {
|
|
Ok(m) => m,
|
|
Err(e) => {
|
|
eprintln!("Error reading metadata for {}: {}", path.display(), e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let size = metadata.len();
|
|
|
|
// Skip files that are too large
|
|
if size > MAX_INDEXABLE_BYTES {
|
|
if verbose {
|
|
eprintln!(
|
|
"[Discovery] Skipping large file: {} ({} bytes)",
|
|
path.display(),
|
|
size
|
|
);
|
|
}
|
|
skipped += 1;
|
|
continue;
|
|
}
|
|
|
|
total_bytes += size;
|
|
|
|
let modified_time = metadata
|
|
.modified()
|
|
.ok()
|
|
.and_then(|t| t.duration_since(UNIX_EPOCH).ok())
|
|
.map(|d| d.as_secs())
|
|
.unwrap_or(0);
|
|
|
|
// Compute fingerprint (hash of content)
|
|
let fingerprint = match compute_fingerprint(path) {
|
|
Ok(fp) => fp,
|
|
Err(e) => {
|
|
eprintln!("Error computing fingerprint for {}: {}", path.display(), e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
files.push(FileRecord {
|
|
path: path.to_path_buf(),
|
|
size,
|
|
modified_time,
|
|
fingerprint,
|
|
});
|
|
}
|
|
|
|
let stats = DiscoveryStats {
|
|
files_found: files.len(),
|
|
files_skipped: skipped,
|
|
total_bytes,
|
|
duration_ms: start.elapsed().as_millis() as u64,
|
|
};
|
|
|
|
if verbose {
|
|
println!(
|
|
"[Discovery] Complete: {} files found, {} skipped, {:.2} MB total",
|
|
files.len(),
|
|
skipped,
|
|
total_bytes as f64 / 1_048_576.0
|
|
);
|
|
}
|
|
|
|
Ok((files, stats))
|
|
}
|
|
|
|
fn should_ignore(path: &Path) -> bool {
|
|
let path_str = path.to_string_lossy();
|
|
let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
|
|
|
for pattern in DEFAULT_IGNORES {
|
|
if pattern.ends_with("/**") {
|
|
let prefix = pattern.trim_end_matches("/**");
|
|
// Check if the path contains this directory
|
|
if path_str.contains(&format!("/{}/", prefix))
|
|
|| path_str.contains(&format!("\\{}\\", prefix))
|
|
|| path_str.contains(&format!("/{}", prefix)) // At start
|
|
|| path_str.starts_with(&format!("{}\\", prefix))
|
|
|| path_str.starts_with(&format!("{}/", prefix))
|
|
{
|
|
return true;
|
|
}
|
|
} else if pattern.starts_with("**/*.") {
|
|
let ext = pattern.trim_start_matches("**/");
|
|
if file_name.ends_with(ext) {
|
|
return true;
|
|
}
|
|
} else if pattern.starts_with("*.") {
|
|
if file_name.ends_with(pattern.trim_start_matches('*')) {
|
|
return true;
|
|
}
|
|
} else if pattern.starts_with('*') && pattern.contains('.') {
|
|
// Pattern like *-lock.json
|
|
let suffix = pattern.trim_start_matches('*');
|
|
if file_name.ends_with(suffix) {
|
|
return true;
|
|
}
|
|
} else if path_str.ends_with(pattern) || file_name == *pattern {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
fn compute_fingerprint(path: &Path) -> Result<String> {
|
|
let content = std::fs::read(path)?;
|
|
let hash = blake3::hash(&content);
|
|
Ok(hash.to_hex()[..16].to_string()) // Use first 16 chars for brevity
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks a handful of paths that the built-in ignore patterns must and
    /// must not match.
    #[test]
    fn test_should_ignore() {
        let ignored = |raw: &str| should_ignore(Path::new(raw));

        // Paths under ignored directories, and lock files, are rejected.
        assert!(ignored("node_modules/package/index.js"));
        assert!(ignored(".git/config"));
        assert!(ignored("target/debug/app.exe"));
        assert!(ignored("package-lock.json"));

        // Ordinary project files are kept.
        assert!(!ignored("src/main.rs"));
        assert!(!ignored("README.md"));
    }
}
|