app_lib/
content_hash.rs

1//! Byte-level duplicate detection: SHA-256 over file contents.
2//!
3//! Groups paths by stored size first (from SQLite), then hashes only size buckets
4//! with more than one path.
5
6use serde::Serialize;
7use sha2::{Digest, Sha256};
8use std::collections::HashMap;
9use std::fs::File;
10use std::io::{BufReader, Read};
11use std::path::Path;
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::Arc;
14
15use rayon::prelude::*;
16use tauri::{AppHandle, Emitter};
17
/// Size in bytes (1 MiB) of the read buffer used while hashing file contents.
const READ_CHUNK: usize = 1 << 20;
19
20/// Hex-encoded SHA-256 of file bytes, or `None` if unreadable.
21pub fn hash_file_sha256(path: &Path) -> Option<String> {
22    let file = File::open(path).ok()?;
23    let mut reader = BufReader::with_capacity(READ_CHUNK, file);
24    let mut hasher = Sha256::new();
25    let mut buf = vec![0u8; READ_CHUNK];
26    loop {
27        let n = reader.read(&mut buf).ok()?;
28        if n == 0 {
29            break;
30        }
31        hasher.update(&buf[..n]);
32    }
33    Some(format!("{:x}", hasher.finalize()))
34}
35
36#[derive(Debug, Clone, Serialize)]
37pub struct ContentDupPath {
38    pub path: String,
39    /// Short domain tag: `plugins`, `audio`, `daw`, `presets`, `pdf`, `midi`.
40    pub kind: String,
41}
42
43#[derive(Debug, Serialize)]
44pub struct ContentDupGroup {
45    pub hash_hex: String,
46    pub size_bytes: u64,
47    pub paths: Vec<ContentDupPath>,
48}
49
50#[derive(Debug, Serialize)]
51pub struct ContentDupScanResult {
52    pub groups: Vec<ContentDupGroup>,
53    /// Files that were hashed (only candidates in multi-path size buckets).
54    pub files_hashed: usize,
55    /// Paths skipped (missing on disk or read error).
56    pub skipped: usize,
57}
58
59/// `entries`: `(path, size_bytes, kind)` for the whole library.
60pub fn find_byte_duplicate_groups(
61    entries: Vec<(String, u64, String)>,
62    progress: Option<(Arc<AppHandle>, usize)>,
63) -> ContentDupScanResult {
64    let mut size_map: HashMap<u64, Vec<(String, String)>> = HashMap::new();
65    for (path, sz, kind) in entries {
66        size_map.entry(sz).or_default().push((path, kind));
67    }
68
69    let mut to_hash: Vec<(String, String, u64)> = Vec::new();
70    for (sz, paths) in size_map {
71        if paths.len() < 2 {
72            continue;
73        }
74        for (p, k) in paths {
75            to_hash.push((p, k, sz));
76        }
77    }
78
79    let total = to_hash.len();
80    if total == 0 {
81        return ContentDupScanResult {
82            groups: vec![],
83            files_hashed: 0,
84            skipped: 0,
85        };
86    }
87
88    let done_ctr = AtomicUsize::new(0);
89    let skipped_ctr = AtomicUsize::new(0);
90
91    let hashed: Vec<(String, String, u64, String)> = to_hash
92        .into_par_iter()
93        .filter_map(|(path, kind, sz)| {
94            let p = Path::new(&path);
95            let h = match hash_file_sha256(p) {
96                Some(x) => x,
97                None => {
98                    skipped_ctr.fetch_add(1, Ordering::Relaxed);
99                    return None;
100                }
101            };
102            if let Some((app, every)) = progress.as_ref() {
103                let n = done_ctr.fetch_add(1, Ordering::Relaxed) + 1;
104                if *every > 0 && (n % *every == 0 || n == total) {
105                    let _ = app.emit(
106                        "content-dup-progress",
107                        serde_json::json!({ "done": n, "total": total }),
108                    );
109                }
110            } else {
111                done_ctr.fetch_add(1, Ordering::Relaxed);
112            }
113            Some((path, kind, sz, h))
114        })
115        .collect();
116
117    let mut by_hash: HashMap<String, Vec<(String, String, u64)>> = HashMap::new();
118    for (path, kind, sz, h) in hashed {
119        by_hash.entry(h).or_default().push((path, kind, sz));
120    }
121
122    let mut groups: Vec<ContentDupGroup> = by_hash
123        .into_iter()
124        .filter(|(_, paths)| paths.len() > 1)
125        .map(|(hash_hex, mut paths)| {
126            paths.sort_by(|a, b| a.0.cmp(&b.0));
127            let size_bytes = paths[0].2;
128            let paths = paths
129                .into_iter()
130                .map(|(path, kind, _)| ContentDupPath { path, kind })
131                .collect();
132            ContentDupGroup {
133                hash_hex,
134                size_bytes,
135                paths,
136            }
137        })
138        .collect();
139
140    groups.sort_by(|a, b| a.hash_hex.cmp(&b.hash_hex));
141
142    ContentDupScanResult {
143        files_hashed: total,
144        skipped: skipped_ctr.into_inner(),
145        groups,
146    }
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns a scratch directory under the OS temp dir, named after the
    /// current process id and `name`. Creation is best effort.
    fn test_dir(name: &str) -> std::path::PathBuf {
        let mut dir = std::env::temp_dir();
        dir.push(format!("ah_content_hash_{}_{}", std::process::id(), name));
        let _ = std::fs::create_dir_all(&dir);
        dir
    }

    /// Two files with the same bytes must produce the same digest.
    #[test]
    fn identical_files_same_hash() {
        let dir = test_dir("same");
        let files = [dir.join("a.bin"), dir.join("b.bin")];
        for file in &files {
            std::fs::write(file, b"hello").unwrap();
        }
        let first = hash_file_sha256(&files[0]);
        let second = hash_file_sha256(&files[1]);
        assert_eq!(first, second, "same bytes => same SHA-256");
        let _ = std::fs::remove_dir_all(&dir);
    }

    /// Two identical files of the same size end up in a single group.
    #[test]
    fn find_groups_two_identical() {
        let dir = test_dir("dup");
        let entries: Vec<(String, u64, String)> = ["a.wav", "b.wav"]
            .iter()
            .map(|name| {
                let file = dir.join(name);
                std::fs::write(&file, b"x").unwrap();
                (file.to_string_lossy().into_owned(), 1, "audio".to_string())
            })
            .collect();
        let result = find_byte_duplicate_groups(entries, None);
        assert_eq!(result.groups.len(), 1);
        assert_eq!(result.groups[0].paths.len(), 2);
        let _ = std::fs::remove_dir_all(&dir);
    }
}