1use crate::history::PdfFile;
8use crate::scanner_skip_dirs::SCANNER_SKIP_DIRS as SKIP_DIRS;
9use crate::unified_walker::IncrementalDirState;
10use rayon::prelude::*;
11use dashmap::DashSet;
12use std::collections::HashSet;
13use std::fs;
14use std::path::{Path, PathBuf};
15use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
16use std::sync::{Arc, Mutex};
17
/// Rewrites a macOS path that lives under `/System/Volumes/Data/` to the same path without
/// that leading component (e.g. `/System/Volumes/Data/Users/x` becomes `/Users/x`).
///
/// On every other target, and for paths that are not below that prefix, `p` is returned
/// unchanged. The prefix test is done on the lossy UTF-8 rendering of the path.
fn normalize_macos_path(p: PathBuf) -> PathBuf {
    #[cfg(target_os = "macos")]
    {
        const DATA_VOLUME_PREFIX: &str = "/System/Volumes/Data";
        let rendered = p.to_string_lossy();
        // Only strip when the prefix is followed by a separator, so that the bare prefix and
        // look-alikes such as `/System/Volumes/Database` are left untouched.
        match rendered.strip_prefix(DATA_VOLUME_PREFIX) {
            Some(remainder) if remainder.starts_with('/') => return PathBuf::from(remainder),
            _ => {}
        }
    }
    p
}
28
/// Extension, including the leading dot, that marks a file as a PDF. The directory walker
/// matches file extensions against it case-insensitively.
const PDF_EXTENSION: &str = ".pdf";
30
/// Renders a byte count as a display string by delegating to the crate-level `format_size`.
fn format_size(bytes: u64) -> String {
    crate::format_size(bytes)
}
34
35pub fn get_pdf_roots() -> Vec<PathBuf> {
37 let home = dirs::home_dir().unwrap_or_default();
38 if home.as_os_str().is_empty() || !home.exists() {
39 return Vec::new();
40 }
41 vec![home]
42}
43
44pub fn walk_for_pdfs(
45 roots: &[PathBuf],
46 on_batch: &mut dyn FnMut(&[PdfFile], usize),
47 should_stop: &(dyn Fn() -> bool + Sync),
48 exclude: Option<HashSet<String>>,
49 active_dirs: Option<Arc<Mutex<Vec<String>>>>,
50 incremental: Option<Arc<IncrementalDirState>>,
51) {
52 let batch_size = 100;
53 let stop = Arc::new(AtomicBool::new(false));
54 let found = Arc::new(AtomicUsize::new(0));
55 let active = active_dirs.unwrap_or_else(|| Arc::new(Mutex::new(Vec::new())));
56 let (tx, rx) = std::sync::mpsc::sync_channel::<Vec<PdfFile>>(256);
57 let visited = Arc::new(DashSet::new());
58 let exclude = Arc::new(exclude.unwrap_or_default());
59
60 let roots_owned: Vec<PathBuf> = roots.to_vec();
61 let stop2 = stop.clone();
62 let found2 = found.clone();
63 let incremental = incremental.clone();
64 let pool = rayon::ThreadPoolBuilder::new()
65 .num_threads(num_cpus::get().max(4))
66 .build()
67 .unwrap();
68 std::thread::spawn(move || {
69 pool.install(|| {
70 roots_owned.par_iter().for_each(|root| {
71 if stop2.load(Ordering::Relaxed) {
72 return;
73 }
74 walk_dir_parallel(
75 root,
76 0,
77 &visited,
78 &tx,
79 &found2,
80 batch_size,
81 &stop2,
82 &exclude,
83 &active,
84 incremental.clone(),
85 );
86 });
87 });
88 drop(pool);
89 });
90
91 let mut total_found = 0usize;
92 loop {
93 if should_stop() {
94 stop.store(true, Ordering::Relaxed);
95 while rx.try_recv().is_ok() {}
96 break;
97 }
98 match rx.recv_timeout(std::time::Duration::from_millis(10)) {
99 Ok(pdfs) => {
100 total_found += pdfs.len();
101 on_batch(&pdfs, total_found);
102 }
103 Err(std::sync::mpsc::RecvTimeoutError::Timeout) => continue,
104 Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break,
105 }
106 }
107}
108
109#[allow(clippy::too_many_arguments)]
110fn walk_dir_parallel(
111 dir: &Path,
112 depth: u32,
113 visited: &Arc<DashSet<PathBuf>>,
114 tx: &std::sync::mpsc::SyncSender<Vec<PdfFile>>,
115 found: &Arc<AtomicUsize>,
116 batch_size: usize,
117 stop: &Arc<AtomicBool>,
118 exclude: &Arc<HashSet<String>>,
119 active_dirs: &Arc<Mutex<Vec<String>>>,
120 incremental: Option<Arc<IncrementalDirState>>,
121) {
122 if depth > 30 || stop.load(Ordering::Relaxed) {
123 return;
124 }
125
126 {
127 let orig = normalize_macos_path(dir.to_path_buf());
128 let canon = fs::canonicalize(dir).ok().map(normalize_macos_path);
129 let key = canon.unwrap_or_else(|| orig.clone());
130 if !visited.insert(key) {
131 return;
132 }
133 visited.insert(orig);
134 }
135
136 if let Some(ref inc) = incremental {
137 if inc.should_skip(dir) {
138 return;
139 }
140 }
141
142 let dir_str = dir.to_string_lossy().to_string();
143 {
144 let mut ad = active_dirs.lock().unwrap_or_else(|e| e.into_inner());
145 ad.push(dir_str.clone());
146 if ad.len() > 200 {
147 let excess = ad.len() - 200;
148 ad.drain(..excess);
149 }
150 }
151
152 let entries: Vec<_> = match fs::read_dir(dir) {
153 Ok(e) => e.flatten().collect(),
154 Err(_e) => {
155 return;
156 }
157 };
158
159 let mut files = Vec::new();
160 let mut subdirs = Vec::new();
161
162 for entry in &entries {
163 let name = entry.file_name();
164 let name_str = name.to_string_lossy();
165 if name_str.starts_with('.')
167 || name_str.starts_with('@')
168 || SKIP_DIRS.contains(&name_str.as_ref())
169 || exclude.contains(name_str.as_ref())
170 {
171 continue;
172 }
173 let ft = match entry.file_type() {
175 Ok(f) => f,
176 Err(_) => continue,
177 };
178 let path = entry.path();
179 if ft.is_dir() {
180 subdirs.push(path);
181 } else if ft.is_file() {
182 files.push((path, dir.to_path_buf()));
183 } else if ft.is_symlink() {
184 match fs::metadata(&path) {
185 Ok(m) if m.is_dir() => {
186 subdirs.push(path);
187 }
188 Ok(m) if m.is_file() => {
189 files.push((path, dir.to_path_buf()));
190 }
191 _ => {}
192 }
193 }
194 }
195
196 let mut batch = Vec::new();
197 for (path, parent) in files {
198 let ext = path
199 .extension()
200 .map(|e| format!(".{}", e.to_string_lossy().to_lowercase()))
201 .unwrap_or_default();
202
203 if ext == PDF_EXTENSION {
204 let path_str = path.to_string_lossy().to_string();
205 if exclude.contains(&path_str) {
206 continue;
207 }
208 if let Ok(meta) = fs::metadata(&path) {
209 let pdf_name = path
210 .file_stem()
211 .map(|s| s.to_string_lossy().to_string())
212 .unwrap_or_default();
213 let modified = meta
214 .modified()
215 .ok()
216 .map(|t| {
217 let dt: chrono::DateTime<chrono::Utc> = t.into();
218 dt.format("%Y-%m-%d").to_string()
219 })
220 .unwrap_or_default();
221
222 batch.push(PdfFile {
223 name: pdf_name,
224 path: path_str,
225 directory: parent.to_string_lossy().to_string(),
226 size: meta.len(),
227 size_formatted: format_size(meta.len()),
228 modified,
229 });
230 found.fetch_add(1, Ordering::Relaxed);
231
232 if batch.len() >= batch_size {
233 let _ = tx.send(batch);
234 batch = Vec::new();
235 }
236 }
237 }
238 }
239 if !batch.is_empty() {
240 let _ = tx.send(batch);
241 }
242
243 subdirs.par_iter().for_each(|subdir| {
244 walk_dir_parallel(
245 subdir,
246 depth + 1,
247 visited,
248 tx,
249 found,
250 batch_size,
251 stop,
252 exclude,
253 active_dirs,
254 incremental.clone(),
255 );
256 });
257
258 if let Some(ref inc) = incremental {
259 inc.record_scanned_dir(dir);
260 }
261}
262
#[cfg(test)]
mod tests {
    use super::*;
    use std::slice::from_ref;

    /// Returns a scratch path below the system temp directory, deleting anything a previous
    /// run left there. The directory itself is not created.
    fn scratch_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(name);
        let _ = fs::remove_dir_all(&dir);
        dir
    }

    /// Runs `walk_for_pdfs` to completion over `roots` and gathers every reported file.
    fn scan(roots: &[PathBuf], exclude: Option<HashSet<String>>) -> Vec<PdfFile> {
        let mut collected = Vec::new();
        walk_for_pdfs(
            roots,
            &mut |batch, _| collected.extend_from_slice(batch),
            &|| false,
            exclude,
            None,
            None,
        );
        collected
    }

    #[test]
    fn test_pdf_extension_constant() {
        assert_eq!(PDF_EXTENSION, ".pdf");
    }

    #[test]
    fn test_get_pdf_roots_returns_existing_paths() {
        for root in &get_pdf_roots() {
            assert!(root.exists(), "returned root should exist: {:?}", root);
        }
    }

    #[test]
    fn test_walk_for_pdfs_empty_dir() {
        let tmp = scratch_dir("upum_test_pdf_empty");
        fs::create_dir_all(&tmp).unwrap();

        let found = scan(from_ref(&tmp), None);

        assert!(found.is_empty());
        let _ = fs::remove_dir_all(&tmp);
    }

    #[test]
    fn test_walk_for_pdfs_finds_files() {
        let tmp = scratch_dir("upum_test_pdf_find");
        fs::create_dir_all(&tmp).unwrap();
        // The upper-case extension checks that matching is case-insensitive.
        fs::write(tmp.join("manual.pdf"), b"%PDF-1.4").unwrap();
        fs::write(tmp.join("book.PDF"), b"%PDF-1.4").unwrap();
        fs::write(tmp.join("notes.txt"), b"nope").unwrap();

        let found = scan(from_ref(&tmp), None);

        assert_eq!(found.len(), 2);
        assert!(found.iter().any(|p| p.name == "manual"));
        assert!(found.iter().any(|p| p.name == "book"));
        let _ = fs::remove_dir_all(&tmp);
    }

    #[test]
    fn test_walk_for_pdfs_skips_hidden_and_blacklisted() {
        let tmp = scratch_dir("upum_test_pdf_skip");
        for sub in [".hidden", "node_modules", "ok"] {
            fs::create_dir_all(tmp.join(sub)).unwrap();
        }
        fs::write(tmp.join(".hidden/a.pdf"), b"h").unwrap();
        fs::write(tmp.join("node_modules/b.pdf"), b"n").unwrap();
        fs::write(tmp.join("ok/c.pdf"), b"ok").unwrap();

        let found = scan(from_ref(&tmp), None);

        assert_eq!(found.len(), 1);
        assert!(found[0].path.contains("/ok/"));
        let _ = fs::remove_dir_all(&tmp);
    }

    #[test]
    fn test_walk_for_pdfs_exclude_set() {
        let tmp = scratch_dir("upum_test_pdf_exclude");
        fs::create_dir_all(&tmp).unwrap();
        fs::write(tmp.join("keep.pdf"), b"x").unwrap();
        let skip = tmp.join("skip.pdf");
        fs::write(&skip, b"x").unwrap();

        // Exclusion by full path string.
        let mut exclude = HashSet::new();
        exclude.insert(skip.to_string_lossy().to_string());

        let found = scan(from_ref(&tmp), Some(exclude));

        assert_eq!(found.len(), 1);
        assert!(found[0].path.ends_with("keep.pdf"));
        let _ = fs::remove_dir_all(&tmp);
    }

    #[test]
    fn test_walk_for_pdfs_deduplicates_overlapping_roots() {
        let tmp = scratch_dir("upum_test_pdf_overlap");
        let child = tmp.join("sub");
        fs::create_dir_all(&child).unwrap();
        fs::write(child.join("overlap.pdf"), b"x").unwrap();
        fs::write(tmp.join("top.pdf"), b"x").unwrap();

        // `child` is reachable both as a root and through `tmp`.
        let found = scan(&[tmp.clone(), child.clone()], None);

        let overlap = found.iter().filter(|p| p.name == "overlap").count();
        assert_eq!(overlap, 1);
        assert!(found.iter().any(|p| p.name == "top"));
        let _ = fs::remove_dir_all(&tmp);
    }

    #[test]
    fn test_walk_for_pdfs_consistent_counts() {
        let tmp = scratch_dir("upum_test_pdf_consistent");
        for i in 0..5 {
            let d = tmp.join(format!("d{i}"));
            fs::create_dir_all(&d).unwrap();
            fs::write(d.join(format!("p{i}.pdf")), b"x").unwrap();
        }

        let first = scan(from_ref(&tmp), None).len();
        let second = scan(from_ref(&tmp), None).len();

        assert_eq!(first, second);
        assert_eq!(first, 5);
        let _ = fs::remove_dir_all(&tmp);
    }
}