temp-deepwiki/src/discover.rs
2025-10-01 18:01:57 +07:00

197 lines
5.4 KiB
Rust

use crate::stats::DiscoveryStats;
use crate::types::FileRecord;
use anyhow::Result;
use ignore::WalkBuilder;
use std::path::Path;
use std::time::{Instant, UNIX_EPOCH};
/// Step 1: Discovery - find all files respecting ignore patterns
const DEFAULT_IGNORES: &[&str] = &[
".git/**",
"node_modules/**",
"dist/**",
"build/**",
"target/**",
"**/*.lock",
"*-lock.json",
"*.lock",
".vscode/**",
".idea/**",
"__pycache__/**",
"*.pyc",
".DS_Store",
];
/// Largest file size, in bytes, that `discover` will record; anything
/// strictly larger is counted as skipped and never fingerprinted.
const MAX_INDEXABLE_BYTES: u64 = 2_000_000; // 2 MB (decimal, i.e. 2 * 10^6 bytes)
/// Step 1: Discovery - walk `root` and collect every indexable file.
///
/// The walk honours the `ignore` crate's standard filters (`.gitignore`,
/// `.ignore`, ...) but still visits hidden files. On top of that, each file is
/// checked against the built-in ignore patterns via `should_ignore` and
/// against the `MAX_INDEXABLE_BYTES` size limit.
///
/// # Arguments
/// * `root` - directory to scan.
/// * `verbose` - when `true`, progress and a final summary are printed.
///
/// # Returns
/// The list of recorded files together with aggregate statistics:
/// * `files_found` - number of records returned,
/// * `files_skipped` - files rejected by the built-in patterns or the size limit,
/// * `total_bytes` - sum of the sizes of the *recorded* files,
/// * `duration_ms` - wall-clock time of the scan.
///
/// # Errors
/// As written this function always returns `Ok`: failures on individual
/// entries (walk errors, unreadable metadata, unreadable content) are reported
/// on stderr and the entry is left out of the result.
pub fn discover<P: AsRef<Path>>(
    root: P,
    verbose: bool,
) -> Result<(Vec<FileRecord>, DiscoveryStats)> {
    let start = Instant::now();
    let root = root.as_ref();

    if verbose {
        println!("[Discovery] Scanning directory: {}", root.display());
    }

    let mut files = Vec::new();
    let mut skipped = 0;
    let mut total_bytes = 0u64;

    let walker = WalkBuilder::new(root)
        .standard_filters(true) // Respects .gitignore, .ignore, etc.
        .hidden(false) // Don't skip hidden files by default
        .build();

    for entry_result in walker {
        let entry = match entry_result {
            Ok(e) => e,
            Err(e) => {
                eprintln!("Error walking directory: {}", e);
                continue;
            }
        };

        // Skip directories (and entries whose type is unknown).
        if entry.file_type().map_or(true, |ft| ft.is_dir()) {
            continue;
        }

        let path = entry.path();

        // Match the built-in patterns against the path *relative to the scan
        // root*. Otherwise a project that merely lives below a directory
        // called `build`, `target`, `dist`, ... would have every file ignored.
        let relative = path.strip_prefix(root).unwrap_or(path);
        if should_ignore(relative) {
            skipped += 1;
            continue;
        }

        let metadata = match std::fs::metadata(path) {
            Ok(m) => m,
            Err(e) => {
                eprintln!("Error reading metadata for {}: {}", path.display(), e);
                continue;
            }
        };

        let size = metadata.len();

        // Skip files that are too large
        if size > MAX_INDEXABLE_BYTES {
            if verbose {
                eprintln!(
                    "[Discovery] Skipping large file: {} ({} bytes)",
                    path.display(),
                    size
                );
            }
            skipped += 1;
            continue;
        }

        // Seconds since the Unix epoch; 0 when the timestamp is unavailable
        // or predates the epoch.
        let modified_time = metadata
            .modified()
            .ok()
            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
            .map(|d| d.as_secs())
            .unwrap_or(0);

        // Compute fingerprint (hash of content)
        let fingerprint = match compute_fingerprint(path) {
            Ok(fp) => fp,
            Err(e) => {
                eprintln!("Error computing fingerprint for {}: {}", path.display(), e);
                continue;
            }
        };

        // Only count bytes for files that actually make it into the result,
        // so `total_bytes` always agrees with `files`.
        total_bytes += size;
        files.push(FileRecord {
            path: path.to_path_buf(),
            size,
            modified_time,
            fingerprint,
        });
    }

    let stats = DiscoveryStats {
        files_found: files.len(),
        files_skipped: skipped,
        total_bytes,
        duration_ms: start.elapsed().as_millis() as u64,
    };

    if verbose {
        println!(
            "[Discovery] Complete: {} files found, {} skipped, {:.2} MB total",
            files.len(),
            skipped,
            total_bytes as f64 / 1_048_576.0
        );
    }

    Ok((files, stats))
}
/// Returns `true` when `path` matches one of the built-in `DEFAULT_IGNORES`
/// patterns.
///
/// Pattern shapes understood (this is not a general glob matcher):
/// * `dir/**` - matches when some *directory* segment of `path` (any segment
///   except the final one) is exactly `dir`. Both `/` and `\` are accepted as
///   separators regardless of platform. A file that merely starts with the
///   same letters (`src/builder.rs` vs `build/**`) does not match.
/// * `*suffix` or `**/*suffix` - matches when the file name ends with `suffix`.
/// * anything else - matches when the file name equals the pattern exactly.
fn should_ignore(path: &Path) -> bool {
    let path_str = path.to_string_lossy();
    let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

    // Split into segments and drop the last one: what remains are the
    // directories leading to the file. `split` always yields at least one
    // segment, so the `pop` only ever removes the file-name part.
    let mut dirs: Vec<&str> = path_str
        .split(|c: char| matches!(c, '/' | '\\'))
        .collect();
    dirs.pop();

    DEFAULT_IGNORES.iter().copied().any(|pattern| {
        // Directory pattern: compare whole segments, never substrings.
        if let Some(dir) = pattern.strip_suffix("/**") {
            return dirs.contains(&dir);
        }

        // A leading `**/` only means "at any depth"; file-name patterns are
        // already depth-independent, so it can simply be dropped.
        let glob = pattern.strip_prefix("**/").unwrap_or(pattern);

        match glob.strip_prefix('*') {
            // `*suffix`: file-name suffix match.
            Some(suffix) => file_name.ends_with(suffix),
            // Literal file name.
            None => file_name == glob,
        }
    })
}
/// Produces a short content fingerprint for the file at `path`.
///
/// The whole file is read into memory, hashed with BLAKE3, and the first
/// 16 hexadecimal characters of the digest are returned.
///
/// # Errors
/// Propagates the I/O error if the file cannot be read.
fn compute_fingerprint(path: &Path) -> Result<String> {
    let bytes = std::fs::read(path)?;
    let digest_hex = blake3::hash(&bytes).to_hex();
    // A 16-character prefix keeps fingerprints brief.
    let short = &digest_hex[..16];
    Ok(short.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `should_ignore` against representative paths for each pattern
    /// shape in `DEFAULT_IGNORES`. Table-driven so a failure names the path
    /// that broke.
    #[test]
    fn test_should_ignore() {
        let ignored = [
            // directory patterns
            "node_modules/package/index.js",
            ".git/config",
            "target/debug/app.exe",
            "src/__pycache__/mod.cpython-311.pyc",
            // file-name suffix patterns
            "package-lock.json",
            "Cargo.lock",
            "lib/util.pyc",
            // literal file names
            ".DS_Store",
            "docs/.DS_Store",
        ];
        for candidate in &ignored {
            assert!(
                should_ignore(Path::new(candidate)),
                "expected {:?} to be ignored",
                candidate
            );
        }

        let kept = ["src/main.rs", "README.md", "Cargo.toml"];
        for candidate in &kept {
            assert!(
                !should_ignore(Path::new(candidate)),
                "expected {:?} to be kept",
                candidate
            );
        }
    }
}