use crate::stats::DiscoveryStats; use crate::types::FileRecord; use anyhow::Result; use ignore::WalkBuilder; use std::path::Path; use std::time::{Instant, UNIX_EPOCH}; /// Step 1: Discovery - find all files respecting ignore patterns const DEFAULT_IGNORES: &[&str] = &[ ".git/**", "node_modules/**", "dist/**", "build/**", "target/**", "**/*.lock", "*-lock.json", "*.lock", ".vscode/**", ".idea/**", "__pycache__/**", "*.pyc", ".DS_Store", ]; const MAX_INDEXABLE_BYTES: u64 = 2_000_000; // 2MB pub fn discover>( root: P, verbose: bool, ) -> Result<(Vec, DiscoveryStats)> { let start = Instant::now(); let root = root.as_ref(); if verbose { println!("[Discovery] Scanning directory: {}", root.display()); } let mut files = Vec::new(); let mut skipped = 0; let mut total_bytes = 0u64; let walker = WalkBuilder::new(root) .standard_filters(true) // Respects .gitignore, .ignore, etc. .hidden(false) // Don't skip hidden files by default .build(); for entry_result in walker { let entry = match entry_result { Ok(e) => e, Err(e) => { eprintln!("Error walking directory: {}", e); continue; } }; // Skip directories if entry.file_type().map_or(true, |ft| ft.is_dir()) { continue; } let path = entry.path(); // Check against default ignores if should_ignore(path) { skipped += 1; continue; } let metadata = match std::fs::metadata(path) { Ok(m) => m, Err(e) => { eprintln!("Error reading metadata for {}: {}", path.display(), e); continue; } }; let size = metadata.len(); // Skip files that are too large if size > MAX_INDEXABLE_BYTES { if verbose { eprintln!( "[Discovery] Skipping large file: {} ({} bytes)", path.display(), size ); } skipped += 1; continue; } total_bytes += size; let modified_time = metadata .modified() .ok() .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) .map(|d| d.as_secs()) .unwrap_or(0); // Compute fingerprint (hash of content) let fingerprint = match compute_fingerprint(path) { Ok(fp) => fp, Err(e) => { eprintln!("Error computing fingerprint for {}: {}", path.display(), e); continue; } }; files.push(FileRecord { path: path.to_path_buf(), size, modified_time, fingerprint, }); } let stats = DiscoveryStats { files_found: files.len(), files_skipped: skipped, total_bytes, duration_ms: start.elapsed().as_millis() as u64, }; if verbose { println!( "[Discovery] Complete: {} files found, {} skipped, {:.2} MB total", files.len(), skipped, total_bytes as f64 / 1_048_576.0 ); } Ok((files, stats)) } fn should_ignore(path: &Path) -> bool { let path_str = path.to_string_lossy(); let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); for pattern in DEFAULT_IGNORES { if pattern.ends_with("/**") { let prefix = pattern.trim_end_matches("/**"); // Check if the path contains this directory if path_str.contains(&format!("/{}/", prefix)) || path_str.contains(&format!("\\{}\\", prefix)) || path_str.contains(&format!("/{}", prefix)) // At start || path_str.starts_with(&format!("{}\\", prefix)) || path_str.starts_with(&format!("{}/", prefix)) { return true; } } else if pattern.starts_with("**/*.") { let ext = pattern.trim_start_matches("**/"); if file_name.ends_with(ext) { return true; } } else if pattern.starts_with("*.") { if file_name.ends_with(pattern.trim_start_matches('*')) { return true; } } else if pattern.starts_with('*') && pattern.contains('.') { // Pattern like *-lock.json let suffix = pattern.trim_start_matches('*'); if file_name.ends_with(suffix) { return true; } } else if path_str.ends_with(pattern) || file_name == *pattern { return true; } } false } fn compute_fingerprint(path: &Path) -> Result { let content = std::fs::read(path)?; let hash = blake3::hash(&content); Ok(hash.to_hex()[..16].to_string()) // Use first 16 chars for brevity } #[cfg(test)] mod tests { use super::*; #[test] fn test_should_ignore() { assert!(should_ignore(Path::new("node_modules/package/index.js"))); assert!(should_ignore(Path::new(".git/config"))); assert!(should_ignore(Path::new("target/debug/app.exe"))); assert!(should_ignore(Path::new("package-lock.json"))); assert!(!should_ignore(Path::new("src/main.rs"))); assert!(!should_ignore(Path::new("README.md"))); } }