app_lib/
pdf_scanner.rs

1//! PDF file scanner.
2//!
3//! Discovers PDF files across user document directories. Supports parallel
4//! traversal and stop signaling (mirrors preset_scanner.rs structure).
5//! Symlinks are followed so link targets are scanned.
6
7use crate::history::PdfFile;
8use crate::scanner_skip_dirs::SCANNER_SKIP_DIRS as SKIP_DIRS;
9use crate::unified_walker::IncrementalDirState;
10use rayon::prelude::*;
11use dashmap::DashSet;
12use std::collections::HashSet;
13use std::fs;
14use std::path::{Path, PathBuf};
15use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
16use std::sync::{Arc, Mutex};
17
/// Maps a macOS firmlinked path under `/System/Volumes/Data/` to its short form.
///
/// On macOS, a path whose lossy string form begins with `/System/Volumes/Data/`
/// is returned with that volume prefix removed (the remainder keeps its leading
/// `/`). Every other path — and every path on other targets — is returned
/// unchanged.
fn normalize_macos_path(p: PathBuf) -> PathBuf {
    #[cfg(target_os = "macos")]
    {
        const DATA_VOLUME: &str = "/System/Volumes/Data";
        let text = p.to_string_lossy();
        if let Some(rest) = text.strip_prefix(DATA_VOLUME) {
            // Require a separator right after the prefix so that the bare
            // volume path and look-alikes such as `/System/Volumes/DataX`
            // are left untouched.
            if rest.starts_with('/') {
                return PathBuf::from(rest);
            }
        }
    }
    p
}
28
/// Lower-case PDF file extension, including the leading dot. The walker
/// compares it against the lower-cased, dot-prefixed extension of each file.
const PDF_EXTENSION: &str = ".pdf";
30
/// Renders a byte count as a display string by delegating to the crate-level
/// `format_size` helper; used to fill `PdfFile::size_formatted`.
fn format_size(bytes: u64) -> String {
    crate::format_size(bytes)
}
34
35/// Default PDF scan roots: user home only (`~`). Add more via Settings → PDF scan directories.
36pub fn get_pdf_roots() -> Vec<PathBuf> {
37    let home = dirs::home_dir().unwrap_or_default();
38    if home.as_os_str().is_empty() || !home.exists() {
39        return Vec::new();
40    }
41    vec![home]
42}
43
44pub fn walk_for_pdfs(
45    roots: &[PathBuf],
46    on_batch: &mut dyn FnMut(&[PdfFile], usize),
47    should_stop: &(dyn Fn() -> bool + Sync),
48    exclude: Option<HashSet<String>>,
49    active_dirs: Option<Arc<Mutex<Vec<String>>>>,
50    incremental: Option<Arc<IncrementalDirState>>,
51) {
52    let batch_size = 100;
53    let stop = Arc::new(AtomicBool::new(false));
54    let found = Arc::new(AtomicUsize::new(0));
55    let active = active_dirs.unwrap_or_else(|| Arc::new(Mutex::new(Vec::new())));
56    let (tx, rx) = std::sync::mpsc::sync_channel::<Vec<PdfFile>>(256);
57    let visited = Arc::new(DashSet::new());
58    let exclude = Arc::new(exclude.unwrap_or_default());
59
60    let roots_owned: Vec<PathBuf> = roots.to_vec();
61    let stop2 = stop.clone();
62    let found2 = found.clone();
63    let incremental = incremental.clone();
64    let pool = rayon::ThreadPoolBuilder::new()
65        .num_threads(num_cpus::get().max(4))
66        .build()
67        .unwrap();
68    std::thread::spawn(move || {
69        pool.install(|| {
70            roots_owned.par_iter().for_each(|root| {
71                if stop2.load(Ordering::Relaxed) {
72                    return;
73                }
74                walk_dir_parallel(
75                    root,
76                    0,
77                    &visited,
78                    &tx,
79                    &found2,
80                    batch_size,
81                    &stop2,
82                    &exclude,
83                    &active,
84                    incremental.clone(),
85                );
86            });
87        });
88        drop(pool);
89    });
90
91    let mut total_found = 0usize;
92    loop {
93        if should_stop() {
94            stop.store(true, Ordering::Relaxed);
95            while rx.try_recv().is_ok() {}
96            break;
97        }
98        match rx.recv_timeout(std::time::Duration::from_millis(10)) {
99            Ok(pdfs) => {
100                total_found += pdfs.len();
101                on_batch(&pdfs, total_found);
102            }
103            Err(std::sync::mpsc::RecvTimeoutError::Timeout) => continue,
104            Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break,
105        }
106    }
107}
108
109#[allow(clippy::too_many_arguments)]
110fn walk_dir_parallel(
111    dir: &Path,
112    depth: u32,
113    visited: &Arc<DashSet<PathBuf>>,
114    tx: &std::sync::mpsc::SyncSender<Vec<PdfFile>>,
115    found: &Arc<AtomicUsize>,
116    batch_size: usize,
117    stop: &Arc<AtomicBool>,
118    exclude: &Arc<HashSet<String>>,
119    active_dirs: &Arc<Mutex<Vec<String>>>,
120    incremental: Option<Arc<IncrementalDirState>>,
121) {
122    if depth > 30 || stop.load(Ordering::Relaxed) {
123        return;
124    }
125
126    {
127        let orig = normalize_macos_path(dir.to_path_buf());
128        let canon = fs::canonicalize(dir).ok().map(normalize_macos_path);
129        let key = canon.unwrap_or_else(|| orig.clone());
130        if !visited.insert(key) {
131            return;
132        }
133        visited.insert(orig);
134    }
135
136    if let Some(ref inc) = incremental {
137        if inc.should_skip(dir) {
138            return;
139        }
140    }
141
142    let dir_str = dir.to_string_lossy().to_string();
143    {
144        let mut ad = active_dirs.lock().unwrap_or_else(|e| e.into_inner());
145        ad.push(dir_str.clone());
146        if ad.len() > 200 {
147            let excess = ad.len() - 200;
148            ad.drain(..excess);
149        }
150    }
151
152    let entries: Vec<_> = match fs::read_dir(dir) {
153        Ok(e) => e.flatten().collect(),
154        Err(_e) => {
155            return;
156        }
157    };
158
159    let mut files = Vec::new();
160    let mut subdirs = Vec::new();
161
162    for entry in &entries {
163        let name = entry.file_name();
164        let name_str = name.to_string_lossy();
165        // `@` prefix = Synology NAS system dirs (@eaDir, @tmp, @syno*, etc.).
166        if name_str.starts_with('.')
167            || name_str.starts_with('@')
168            || SKIP_DIRS.contains(&name_str.as_ref())
169            || exclude.contains(name_str.as_ref())
170        {
171            continue;
172        }
173        // Cached d_type from readdir — no extra stat() syscall per entry.
174        let ft = match entry.file_type() {
175            Ok(f) => f,
176            Err(_) => continue,
177        };
178        let path = entry.path();
179        if ft.is_dir() {
180            subdirs.push(path);
181        } else if ft.is_file() {
182            files.push((path, dir.to_path_buf()));
183        } else if ft.is_symlink() {
184            match fs::metadata(&path) {
185                Ok(m) if m.is_dir() => {
186                    subdirs.push(path);
187                }
188                Ok(m) if m.is_file() => {
189                    files.push((path, dir.to_path_buf()));
190                }
191                _ => {}
192            }
193        }
194    }
195
196    let mut batch = Vec::new();
197    for (path, parent) in files {
198        let ext = path
199            .extension()
200            .map(|e| format!(".{}", e.to_string_lossy().to_lowercase()))
201            .unwrap_or_default();
202
203        if ext == PDF_EXTENSION {
204            let path_str = path.to_string_lossy().to_string();
205            if exclude.contains(&path_str) {
206                continue;
207            }
208            if let Ok(meta) = fs::metadata(&path) {
209                let pdf_name = path
210                    .file_stem()
211                    .map(|s| s.to_string_lossy().to_string())
212                    .unwrap_or_default();
213                let modified = meta
214                    .modified()
215                    .ok()
216                    .map(|t| {
217                        let dt: chrono::DateTime<chrono::Utc> = t.into();
218                        dt.format("%Y-%m-%d").to_string()
219                    })
220                    .unwrap_or_default();
221
222                batch.push(PdfFile {
223                    name: pdf_name,
224                    path: path_str,
225                    directory: parent.to_string_lossy().to_string(),
226                    size: meta.len(),
227                    size_formatted: format_size(meta.len()),
228                    modified,
229                });
230                found.fetch_add(1, Ordering::Relaxed);
231
232                if batch.len() >= batch_size {
233                    let _ = tx.send(batch);
234                    batch = Vec::new();
235                }
236            }
237        }
238    }
239    if !batch.is_empty() {
240        let _ = tx.send(batch);
241    }
242
243    subdirs.par_iter().for_each(|subdir| {
244        walk_dir_parallel(
245            subdir,
246            depth + 1,
247            visited,
248            tx,
249            found,
250            batch_size,
251            stop,
252            exclude,
253            active_dirs,
254            incremental.clone(),
255        );
256    });
257
258    if let Some(ref inc) = incremental {
259        inc.record_scanned_dir(dir);
260    }
261}
262
#[cfg(test)]
mod tests {
    use super::*;
    use std::slice::from_ref;

    /// Scratch directory for one test.
    ///
    /// The name includes the process id so concurrent test processes do not
    /// share a directory, and the tree is removed on drop so cleanup also
    /// happens when an assertion panics.
    struct ScratchDir {
        root: PathBuf,
    }

    impl ScratchDir {
        /// Creates an empty scratch directory labelled `label`.
        fn new(label: &str) -> Self {
            let root = std::env::temp_dir()
                .join(format!("upum_test_pdf_{label}_{}", std::process::id()));
            let _ = fs::remove_dir_all(&root);
            fs::create_dir_all(&root).unwrap();
            Self { root }
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            let _ = fs::remove_dir_all(&self.root);
        }
    }

    /// Runs a full, never-cancelled scan and returns every PDF reported.
    fn scan(roots: &[PathBuf], exclude: Option<HashSet<String>>) -> Vec<PdfFile> {
        let mut found = Vec::new();
        walk_for_pdfs(
            roots,
            &mut |batch, _| found.extend_from_slice(batch),
            &|| false,
            exclude,
            None,
            None,
        );
        found
    }

    #[test]
    fn test_pdf_extension_constant() {
        assert_eq!(PDF_EXTENSION, ".pdf");
    }

    #[test]
    fn test_get_pdf_roots_returns_existing_paths() {
        for r in &get_pdf_roots() {
            assert!(r.exists(), "returned root should exist: {:?}", r);
        }
    }

    #[test]
    fn test_walk_for_pdfs_empty_dir() {
        let tmp = ScratchDir::new("empty");
        assert!(scan(from_ref(&tmp.root), None).is_empty());
    }

    #[test]
    fn test_walk_for_pdfs_finds_files() {
        let tmp = ScratchDir::new("find");
        fs::write(tmp.root.join("manual.pdf"), b"%PDF-1.4").unwrap();
        fs::write(tmp.root.join("book.PDF"), b"%PDF-1.4").unwrap();
        fs::write(tmp.root.join("notes.txt"), b"nope").unwrap();

        let found = scan(from_ref(&tmp.root), None);
        assert_eq!(found.len(), 2);
        assert!(found.iter().any(|p| p.name == "manual"));
        assert!(found.iter().any(|p| p.name == "book"));
    }

    #[test]
    fn test_walk_for_pdfs_skips_hidden_and_blacklisted() {
        let tmp = ScratchDir::new("skip");
        fs::create_dir_all(tmp.root.join(".hidden")).unwrap();
        fs::create_dir_all(tmp.root.join("node_modules")).unwrap();
        fs::create_dir_all(tmp.root.join("ok")).unwrap();
        fs::write(tmp.root.join(".hidden").join("a.pdf"), b"h").unwrap();
        fs::write(tmp.root.join("node_modules").join("b.pdf"), b"n").unwrap();
        fs::write(tmp.root.join("ok").join("c.pdf"), b"ok").unwrap();

        let found = scan(from_ref(&tmp.root), None);
        assert_eq!(found.len(), 1);
        // Compare path components rather than a "/ok/" substring so the
        // check does not depend on the platform's path separator.
        let parent = Path::new(&found[0].path).parent().unwrap();
        assert!(parent.ends_with("ok"));
    }

    #[test]
    fn test_walk_for_pdfs_exclude_set() {
        let tmp = ScratchDir::new("exclude");
        fs::write(tmp.root.join("keep.pdf"), b"x").unwrap();
        let skip = tmp.root.join("skip.pdf");
        fs::write(&skip, b"x").unwrap();

        let mut exclude = HashSet::new();
        exclude.insert(skip.to_string_lossy().to_string());

        let found = scan(from_ref(&tmp.root), Some(exclude));
        assert_eq!(found.len(), 1);
        assert!(found[0].path.ends_with("keep.pdf"));
    }

    #[test]
    fn test_walk_for_pdfs_deduplicates_overlapping_roots() {
        let tmp = ScratchDir::new("overlap");
        let child = tmp.root.join("sub");
        fs::create_dir_all(&child).unwrap();
        fs::write(child.join("overlap.pdf"), b"x").unwrap();
        fs::write(tmp.root.join("top.pdf"), b"x").unwrap();

        let found = scan(&[tmp.root.clone(), child.clone()], None);
        let overlap = found.iter().filter(|p| p.name == "overlap").count();
        assert_eq!(overlap, 1);
        assert!(found.iter().any(|p| p.name == "top"));
    }

    #[test]
    fn test_walk_for_pdfs_consistent_counts() {
        let tmp = ScratchDir::new("consistent");
        for i in 0..5 {
            let d = tmp.root.join(format!("d{i}"));
            fs::create_dir_all(&d).unwrap();
            fs::write(d.join(format!("p{i}.pdf")), b"x").unwrap();
        }

        let first = scan(from_ref(&tmp.root), None).len();
        let second = scan(from_ref(&tmp.root), None).len();
        assert_eq!(first, second);
        assert_eq!(first, 5);
    }
}