app_lib/
pdf_meta.rs

1//! PDF metadata extraction (page count).
2//!
3//! Uses `lopdf` to read the document catalog and return the page count.
4//! Designed for bulk extraction — returns `None` on any parse error so
5//! one bad file doesn't stop a batch job.
6
7use rayon::prelude::*;
8
9/// Page count for a single PDF. Returns None if the file can't be parsed.
10pub fn extract_page_count(path: &str) -> Option<u32> {
11    let doc = lopdf::Document::load(path).ok()?;
12    Some(doc.get_pages().len() as u32)
13}
14
15/// Batch page-count extraction with parallel parsing. Returns (path, pages) pairs
16/// only for PDFs that parsed successfully.
17pub fn extract_pages_batch(paths: &[String]) -> Vec<(String, u32)> {
18    paths
19        .par_iter()
20        .filter_map(|p| extract_page_count(p).map(|n| (p.clone(), n)))
21        .collect()
22}
23
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::{Path, PathBuf};

    /// Builds a fixture path in the system temp dir.
    ///
    /// The process id is embedded so that separate test processes running at
    /// the same time never share a file; `tag` keeps tests within one process
    /// apart.
    fn temp_pdf_path(tag: &str) -> PathBuf {
        std::env::temp_dir().join(format!("ah_pdf_meta_{tag}_{}.pdf", std::process::id()))
    }

    /// Generates a printpdf document with `page_count` blank pages and writes it to `path`.
    fn write_blank_pdf(path: &Path, title: &str, page_count: usize) {
        use printpdf::{Mm, Op, PdfDocument, PdfPage, PdfSaveOptions};

        let pages = (0..page_count)
            .map(|_| {
                PdfPage::new(
                    Mm(40.0),
                    Mm(40.0),
                    vec![Op::SaveGraphicsState, Op::RestoreGraphicsState],
                )
            })
            .collect::<Vec<_>>();

        let mut doc = PdfDocument::new(title);
        doc.with_pages(pages);
        let bytes = doc.save(&PdfSaveOptions::default(), &mut Vec::new());

        // `fs::write` surfaces write failures directly; a temporary `BufWriter`
        // would only flush on drop, where any error is silently discarded.
        std::fs::write(path, &bytes).expect("write temp pdf");
    }

    #[test]
    fn extract_pages_missing_file_returns_none() {
        assert!(extract_page_count("/nonexistent/file.pdf").is_none());
    }

    #[test]
    fn extract_pages_not_a_pdf_returns_none() {
        let tmp = temp_pdf_path("not_a_pdf");
        std::fs::write(&tmp, b"this is not a pdf").unwrap();
        let res = extract_page_count(tmp.to_str().unwrap());
        // Clean up before asserting so a failure does not leave the file behind.
        let _ = std::fs::remove_file(&tmp);
        assert!(res.is_none());
    }

    #[test]
    fn extract_pages_batch_skips_bad_files() {
        let paths = vec![
            "/nonexistent/a.pdf".to_string(),
            "/nonexistent/b.pdf".to_string(),
        ];
        let result = extract_pages_batch(&paths);
        assert!(result.is_empty());
    }

    /// printpdf emits a valid file; lopdf must agree on page count (regression for bulk PDF indexing).
    #[test]
    fn extract_page_count_matches_printpdf_three_pages() {
        let tmp = temp_pdf_path("three");
        write_blank_pdf(&tmp, "pdf_meta_test", 3);

        let n = extract_page_count(tmp.to_str().unwrap());
        let _ = std::fs::remove_file(&tmp);
        assert_eq!(n, Some(3));
    }

    #[test]
    fn extract_pages_batch_merges_valid_paths() {
        let a = temp_pdf_path("batch_a");
        let b = temp_pdf_path("batch_b");
        write_blank_pdf(&a, "a", 1);
        write_blank_pdf(&b, "b", 2);

        let a_str = a.to_string_lossy().into_owned();
        let b_str = b.to_string_lossy().into_owned();
        let paths = vec![
            a_str.clone(),
            b_str.clone(),
            "/totally/missing/xyz.pdf".to_string(),
        ];
        let mut pairs = extract_pages_batch(&paths);
        pairs.sort();

        let _ = std::fs::remove_file(&a);
        let _ = std::fs::remove_file(&b);

        // Compare whole pairs so a count attached to the wrong path is caught,
        // not just the presence of the counts 1 and 2.
        let mut expected = vec![(a_str, 1), (b_str, 2)];
        expected.sort();
        assert_eq!(pairs, expected);
    }
}