app_lib/
bulk_stat.rs

1//! Bulk directory enumeration + metadata fetch in a single syscall.
2//!
3//! ## Why
4//! Standard `fs::read_dir` returns just names + d_type. To get size/mtime we
5//! call `fs::metadata()` per file — one `stat(2)` syscall each. On SMB/NFS
6//! every syscall = network roundtrip (1-10ms LAN, 50-200ms WAN). Walking a
7//! 100k-file share spends 200+ seconds in per-file stats.
8//!
//! macOS has `getattrlistbulk(2)`, which returns metadata for as many
//! directory entries as fit in the caller's buffer per syscall. `find(1)`
//! and `fts(3)` use it. For SMB it's a 10-100× speedup on metadata cost.
12//!
13//! Other platforms (Linux, Windows) fall back to readdir+metadata. Linux
14//! has no direct equivalent — the closest is `statx` which is still per-file.
15//!
16//! ## Usage
17//! Call `read_dir_bulk(path)` instead of `fs::read_dir(path)` when you need
18//! type + size + mtime per entry. Returns a `Vec<BulkEntry>` with all
19//! metadata already populated. If the platform-specific fast path fails
20//! (unsupported filesystem, permission error, etc.) it falls back to the
21//! portable `fs::read_dir + metadata` path transparently.
22
23use std::fs;
24use std::io;
25use std::path::{Path, PathBuf};
26
/// One directory entry together with the metadata collected while listing
/// its parent directory.
#[derive(Debug, Clone)]
pub struct BulkEntry {
    /// Final path component, converted lossily to UTF-8.
    pub name: String,
    /// Path of the entry inside the directory that was listed.
    pub path: PathBuf,
    /// `true` when the entry's own type is a directory.
    pub is_dir: bool,
    /// `true` when the entry's own type is a regular file.
    pub is_file: bool,
    /// `true` when the entry's own type is a symbolic link.
    pub is_symlink: bool,
    /// Size in bytes. Always 0 for entries that are not regular files, and 0
    /// when the size could not be obtained.
    pub size: u64,
    /// Modified time as seconds since UNIX epoch. 0 if unavailable.
    /// The portable path only fills this in for regular files.
    pub mtime_secs: i64,
}
38
39/// Enumerate `dir` returning name + type + size + mtime for every entry in
40/// a single syscall where possible. Hidden `.` and `..` entries are excluded.
41pub fn read_dir_bulk(dir: &Path) -> io::Result<Vec<BulkEntry>> {
42    #[cfg(target_os = "macos")]
43    {
44        match macos::read_dir_bulk_fast(dir) {
45            Ok(v) => return Ok(v),
46            Err(e) => {
47                // Fall back to portable path on error (unsupported FS, etc.).
48                // Log at debug level so production noise stays low.
49                let _ = e;
50            }
51        }
52    }
53    read_dir_bulk_portable(dir)
54}
55
56/// Portable fallback: readdir + per-entry metadata. One syscall per entry.
57fn read_dir_bulk_portable(dir: &Path) -> io::Result<Vec<BulkEntry>> {
58    let entries = fs::read_dir(dir)?;
59    let mut out = Vec::new();
60    for entry in entries.flatten() {
61        let name = entry.file_name().to_string_lossy().to_string();
62        let path = entry.path();
63        let ft = match entry.file_type() {
64            Ok(f) => f,
65            Err(_) => continue,
66        };
67        // Stat only files — stats are syscalls on Unix. Directory mtimes are
68        // left as 0 on the portable path; callers that need dir mtime (DAW
69        // packages) must stat those specific directories themselves. On
70        // Windows, DirEntry::metadata() is free (cached from FindFirstFileW)
71        // so stat'ing dirs there would be cheap, but we keep the logic
72        // uniform across portable platforms.
73        let (size, mtime_secs) = if ft.is_file() {
74            match entry.metadata() {
75                Ok(m) => (
76                    m.len(),
77                    m.modified()
78                        .ok()
79                        .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
80                        .map(|d| d.as_secs() as i64)
81                        .unwrap_or(0),
82                ),
83                Err(_) => (0, 0),
84            }
85        } else {
86            (0, 0)
87        };
88        out.push(BulkEntry {
89            name,
90            path,
91            is_dir: ft.is_dir(),
92            is_file: ft.is_file(),
93            is_symlink: ft.is_symlink(),
94            size,
95            mtime_secs,
96        });
97    }
98    Ok(out)
99}
100
#[cfg(target_os = "macos")]
mod macos {
    //! macOS-specific bulk metadata via `getattrlistbulk(2)`.
    //!
    //! Layout reference: the man page at `man 2 getattrlistbulk`.
    //! Each returned entry is a `u32` total length followed by the requested
    //! attributes packed in bitmap order. With the attributes requested here
    //! the fixed part of an entry is:
    //!
    //! | field                         | bytes |
    //! |-------------------------------|-------|
    //! | total length (`u32`)          | 4     |
    //! | returned `attribute_set_t`    | 20    |
    //! | name `attrreference_t`        | 8     |
    //! | object type (`u32`)           | 4     |
    //! | modification `timespec`       | 16    |
    //! | data length (`off_t`, files)  | 8     |
    //!
    //! The variable-length name bytes follow the fixed part. Because
    //! `FSOPT_PACK_INVAL_ATTRS` is passed, requested attributes are packed
    //! even when they are invalid for an entry; the returned attribute set
    //! only says which of the packed values are meaningful.
    use super::BulkEntry;
    use std::ffi::OsStr;
    use std::fs::OpenOptions;
    use std::io;
    use std::os::raw::{c_int, c_void};
    use std::os::unix::ffi::OsStrExt;
    use std::os::unix::fs::OpenOptionsExt;
    use std::os::unix::io::AsRawFd;
    use std::path::Path;

    // attrlist bitmap count
    const ATTR_BIT_MAP_COUNT: u16 = 5;

    // common attributes (attrgroup_t bits)
    const ATTR_CMN_RETURNED_ATTRS: u32 = 0x80000000;
    const ATTR_CMN_NAME: u32 = 0x00000001;
    const ATTR_CMN_OBJTYPE: u32 = 0x00000008;
    const ATTR_CMN_MODTIME: u32 = 0x00000400;

    // file attributes
    const ATTR_FILE_DATALENGTH: u32 = 0x00000200;

    // FSOPT: pack placeholders for requested-but-unavailable attrs so the
    // layout stays predictable.
    const FSOPT_PACK_INVAL_ATTRS: u64 = 0x00000008;

    // vnode_type_t / fsobj_type_t values
    const VREG: u32 = 1;
    const VDIR: u32 = 2;
    const VLNK: u32 = 5;

    // 64KB of attribute buffer per call; the call is repeated until the
    // kernel reports that no entries remain.
    const BUFSIZE: usize = 64 * 1024;

    /// Mirror of the C `struct attrlist` passed to the kernel.
    #[repr(C)]
    struct Attrlist {
        bitmapcount: u16,
        reserved: u16,
        commonattr: u32,
        volattr: u32,
        dirattr: u32,
        fileattr: u32,
        forkattr: u32,
    }

    unsafe extern "C" {
        fn getattrlistbulk(
            dirfd: c_int,
            alist: *mut c_void,
            attrbuf: *mut c_void,
            bufsize: usize,
            options: u64,
        ) -> c_int;
    }

    /// Enumerate `dir` with `getattrlistbulk(2)`, returning one `BulkEntry`
    /// per directory entry (excluding `.` and `..`).
    ///
    /// # Errors
    /// Returns the OS error when the directory cannot be opened or the
    /// syscall fails, and `InvalidData` when the returned buffer does not
    /// have the expected layout. Callers are expected to fall back to the
    /// portable implementation on any error.
    pub fn read_dir_bulk_fast(dir: &Path) -> io::Result<Vec<BulkEntry>> {
        // Opening through std passes the path bytes to the OS unmodified
        // (no lossy UTF-8 round trip) and closes the descriptor on drop.
        let dir_handle = OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_DIRECTORY)
            .open(dir)?;
        let fd = dir_handle.as_raw_fd();

        // ATTR_CMN_ERROR is intentionally not requested, so no per-entry
        // error word appears in the packed data.
        let mut alist = Attrlist {
            bitmapcount: ATTR_BIT_MAP_COUNT,
            reserved: 0,
            commonattr: ATTR_CMN_RETURNED_ATTRS
                | ATTR_CMN_NAME
                | ATTR_CMN_OBJTYPE
                | ATTR_CMN_MODTIME,
            volattr: 0,
            dirattr: 0,
            fileattr: ATTR_FILE_DATALENGTH,
            forkattr: 0,
        };

        let mut buf = vec![0u8; BUFSIZE];
        let mut out = Vec::new();

        loop {
            // SAFETY: `fd` stays open for the whole loop because
            // `dir_handle` is alive until the function returns; `alist` is a
            // properly initialised `repr(C)` attrlist; `buf` is a writable
            // allocation of exactly `buf.len()` bytes.
            let count = unsafe {
                getattrlistbulk(
                    fd,
                    &mut alist as *mut Attrlist as *mut c_void,
                    buf.as_mut_ptr() as *mut c_void,
                    buf.len(),
                    FSOPT_PACK_INVAL_ATTRS,
                )
            };
            if count < 0 {
                return Err(io::Error::last_os_error());
            }
            if count == 0 {
                break;
            }

            let mut entry_start = 0usize;
            for _ in 0..count {
                // First field: u32 total entry length (includes this field).
                let mut cursor = entry_start;
                let total_len = read_u32(
                    &buf,
                    &mut cursor,
                    "getattrlistbulk returned truncated entry length",
                )? as usize;
                let entry = entry_start
                    .checked_add(total_len)
                    .and_then(|end| buf.get(entry_start..end))
                    .filter(|bytes| bytes.len() >= 4)
                    .ok_or_else(|| invalid("getattrlistbulk returned bad entry length"))?;

                if let Some(item) = parse_entry(dir, entry)? {
                    out.push(item);
                }
                // Jump to the declared boundary; this skips the name bytes
                // and any padding.
                entry_start += total_len;
            }
        }

        Ok(out)
    }

    /// Decode one packed entry (`entry` spans exactly the entry's declared
    /// length, starting at its length prefix). Returns `Ok(None)` for
    /// entries without a usable name and for `.` / `..`.
    fn parse_entry(dir: &Path, entry: &[u8]) -> io::Result<Option<BulkEntry>> {
        // Skip the length prefix; the caller has already validated it.
        let mut cursor = 4usize;

        // attribute_set_t: common, vol, dir, file, fork. These are the
        // attributes that are actually valid for this entry.
        let mut returned = [0u32; 5];
        for group in returned.iter_mut() {
            *group = read_u32(entry, &mut cursor, "u32 oob")?;
        }
        let (valid_common, valid_file) = (returned[0], returned[3]);

        // The fixed-size common attributes are consumed unconditionally:
        // FSOPT_PACK_INVAL_ATTRS makes the kernel pack every requested
        // attribute, so skipping one whose "returned" bit is clear would
        // misalign every field after it.

        // ATTR_CMN_NAME: attrreference_t (offset is relative to the start of
        // the attrreference itself, NOT the entry).
        let ref_pos = cursor;
        let name_offset = i32::from_ne_bytes(take::<4>(entry, &mut cursor, "attrreference oob")?);
        let name_len = read_u32(entry, &mut cursor, "attrreference oob")? as usize;

        // ATTR_CMN_OBJTYPE: u32 fsobj_type.
        let raw_objtype = read_u32(entry, &mut cursor, "u32 oob")?;

        // ATTR_CMN_MODTIME: struct timespec (i64 tv_sec + i64 tv_nsec).
        let tv_sec = i64::from_ne_bytes(take::<8>(entry, &mut cursor, "timespec oob")?);
        let _tv_nsec = take::<8>(entry, &mut cursor, "timespec oob")?;

        // ATTR_FILE_DATALENGTH: off_t (i64). It is the last fixed field, so
        // it is only read when the kernel says it is valid.
        let size = if valid_file & ATTR_FILE_DATALENGTH != 0 {
            let raw = i64::from_ne_bytes(take::<8>(entry, &mut cursor, "datalength oob")?);
            u64::try_from(raw).unwrap_or(0)
        } else {
            0
        };

        let name_bytes: &[u8] = if valid_common & ATTR_CMN_NAME != 0 {
            let start = ref_pos
                .checked_add_signed(name_offset as isize)
                .ok_or_else(|| invalid("name offset oob"))?;
            let stored = start
                .checked_add(name_len)
                .and_then(|end| entry.get(start..end))
                .ok_or_else(|| invalid("name offset oob"))?;
            // The stored length includes the trailing NUL byte.
            stored.split(|&b| b == 0).next().unwrap_or(&[])
        } else {
            &[]
        };
        if name_bytes.is_empty() || name_bytes == b"." || name_bytes == b".." {
            return Ok(None);
        }

        let objtype = if valid_common & ATTR_CMN_OBJTYPE != 0 {
            raw_objtype
        } else {
            0
        };
        let mtime_secs = if valid_common & ATTR_CMN_MODTIME != 0 {
            tv_sec
        } else {
            0
        };
        let is_file = objtype == VREG;

        Ok(Some(BulkEntry {
            name: String::from_utf8_lossy(name_bytes).into_owned(),
            // Build the path from the raw bytes so names that are not valid
            // UTF-8 still point at the real file.
            path: dir.join(OsStr::from_bytes(name_bytes)),
            is_dir: objtype == VDIR,
            is_file,
            is_symlink: objtype == VLNK,
            size: if is_file { size } else { 0 },
            mtime_secs,
        }))
    }

    /// Build the `InvalidData` error used for every layout violation.
    fn invalid(msg: &'static str) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, msg)
    }

    /// Copy `N` bytes out of `bytes` at `*cursor` and advance the cursor,
    /// failing with `what` when the range is out of bounds.
    fn take<const N: usize>(
        bytes: &[u8],
        cursor: &mut usize,
        what: &'static str,
    ) -> io::Result<[u8; N]> {
        let end = cursor.checked_add(N).ok_or_else(|| invalid(what))?;
        let src = bytes.get(*cursor..end).ok_or_else(|| invalid(what))?;
        let mut out = [0u8; N];
        out.copy_from_slice(src);
        *cursor = end;
        Ok(out)
    }

    /// Read a native-endian `u32` at `*cursor` and advance the cursor.
    fn read_u32(bytes: &[u8], cursor: &mut usize, what: &'static str) -> io::Result<u32> {
        take::<4>(bytes, cursor, what).map(u32::from_ne_bytes)
    }
}
374
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;

    /// Scratch directory under the system temp dir, removed again on drop.
    struct TestDir {
        path: PathBuf,
    }
    impl TestDir {
        /// Create a fresh directory whose name combines `name` with the
        /// current time in nanoseconds.
        fn new(name: &str) -> Self {
            let path = std::env::temp_dir().join(format!(
                "upum_bs_{}_{}",
                name,
                std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .unwrap()
                    .as_nanos()
            ));
            let _ = fs::remove_dir_all(&path);
            fs::create_dir_all(&path).unwrap();
            Self { path }
        }
    }
    impl Drop for TestDir {
        fn drop(&mut self) {
            let _ = fs::remove_dir_all(&self.path);
        }
    }

    /// Write `content` to `p`, creating parent directories as needed.
    fn touch_with(p: &Path, content: &[u8]) {
        if let Some(parent) = p.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        let mut f = File::create(p).unwrap();
        f.write_all(content).unwrap();
    }

    /// Files and a subdirectory are classified correctly and file sizes match.
    #[test]
    fn test_read_dir_bulk_basic() {
        let tmp = TestDir::new("basic");
        touch_with(&tmp.path.join("a.txt"), b"hello");
        touch_with(&tmp.path.join("b.dat"), b"0123456789");
        fs::create_dir_all(tmp.path.join("sub")).unwrap();

        let entries = read_dir_bulk(&tmp.path).unwrap();
        assert_eq!(entries.len(), 3);
        let by_name: std::collections::HashMap<_, _> =
            entries.iter().map(|e| (e.name.clone(), e)).collect();
        let a = by_name.get("a.txt").expect("a.txt missing");
        assert!(a.is_file);
        assert!(!a.is_dir);
        assert_eq!(a.size, 5);
        let b = by_name.get("b.dat").expect("b.dat missing");
        assert_eq!(b.size, 10);
        let sub = by_name.get("sub").expect("sub missing");
        assert!(sub.is_dir);
        assert!(!sub.is_file);
    }

    /// An empty directory yields no entries.
    #[test]
    fn test_read_dir_bulk_empty() {
        let tmp = TestDir::new("empty");
        let entries = read_dir_bulk(&tmp.path).unwrap();
        assert_eq!(entries.len(), 0);
    }

    /// `.` and `..` never show up in the result.
    #[test]
    fn test_read_dir_bulk_excludes_dot_entries() {
        let tmp = TestDir::new("dotentries");
        touch_with(&tmp.path.join("real.txt"), b"x");
        let entries = read_dir_bulk(&tmp.path).unwrap();
        let names: Vec<_> = entries.iter().map(|e| e.name.as_str()).collect();
        assert!(!names.contains(&"."));
        assert!(!names.contains(&".."));
        assert!(names.contains(&"real.txt"));
    }

    /// Listing a missing directory is an error, not an empty result.
    #[test]
    fn test_read_dir_bulk_nonexistent_dir() {
        let tmp = TestDir::new("nonexistent");
        let missing = tmp.path.join("does-not-exist");
        let result = read_dir_bulk(&missing);
        assert!(result.is_err());
    }

    /// The getattrlistbulk loop must handle more than one buffer pass.
    #[test]
    fn test_read_dir_bulk_many_entries() {
        // Each packed entry carries at least 60 bytes of fixed fields plus
        // its name, so 2000 entries (> 117 KiB) cannot fit in a single
        // 64 KiB buffer and the loop is forced to iterate. A few hundred
        // entries would fit in one pass and leave the loop untested.
        const COUNT: usize = 2000;
        let tmp = TestDir::new("many");
        for i in 0..COUNT {
            touch_with(&tmp.path.join(format!("file_{:04}.txt", i)), b"x");
        }
        let entries = read_dir_bulk(&tmp.path).unwrap();
        assert_eq!(entries.len(), COUNT);
        // All should be files with size 1.
        assert!(entries.iter().all(|e| e.is_file && e.size == 1));
        // Every file must be reported exactly once across buffer passes.
        let unique: std::collections::HashSet<_> =
            entries.iter().map(|e| e.name.as_str()).collect();
        assert_eq!(unique.len(), COUNT);
    }

    /// A freshly written file reports an mtime close to the current time.
    #[test]
    fn test_read_dir_bulk_mtime_populated() {
        let tmp = TestDir::new("mtime");
        touch_with(&tmp.path.join("x.txt"), b"y");
        let entries = read_dir_bulk(&tmp.path).unwrap();
        let x = entries.iter().find(|e| e.name == "x.txt").unwrap();
        // mtime should be near "now" (within the last 60 seconds).
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_secs() as i64;
        assert!(x.mtime_secs > 0, "mtime should be populated");
        assert!(
            (now - x.mtime_secs).abs() < 60,
            "mtime {} should be near now {}",
            x.mtime_secs,
            now
        );
    }

    /// The bulk path and the portable path should produce identical
    /// classification + sizes. (macOS-specific — on other platforms both
    /// paths ARE the portable one.)
    #[test]
    fn test_read_dir_bulk_matches_portable() {
        let tmp = TestDir::new("parity");
        touch_with(&tmp.path.join("a.wav"), b"RIFF1234");
        touch_with(&tmp.path.join("b.pdf"), b"%PDF-1.4");
        fs::create_dir_all(tmp.path.join("dir1")).unwrap();
        let bulk = read_dir_bulk(&tmp.path).unwrap();
        let portable = read_dir_bulk_portable(&tmp.path).unwrap();
        assert_eq!(bulk.len(), portable.len());
        let bulk_names: std::collections::HashSet<_> =
            bulk.iter().map(|e| e.name.clone()).collect();
        let portable_names: std::collections::HashSet<_> =
            portable.iter().map(|e| e.name.clone()).collect();
        assert_eq!(bulk_names, portable_names);
        // Sizes should match for files (bulk populates from bulk syscall,
        // portable populates from per-file metadata).
        for b in &bulk {
            if b.is_file {
                let p = portable.iter().find(|e| e.name == b.name).unwrap();
                assert_eq!(
                    b.size, p.size,
                    "size mismatch for {}: bulk={} portable={}",
                    b.name, b.size, p.size
                );
            }
        }
    }
}
524}