1use serde::Serialize;
7use sha2::{Digest, Sha256};
8use std::collections::HashMap;
9use std::fs::File;
10use std::io::{BufReader, Read};
11use std::path::Path;
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::Arc;
14
15use rayon::prelude::*;
16use tauri::{AppHandle, Emitter};
17
18const READ_CHUNK: usize = 1024 * 1024;
19
20pub fn hash_file_sha256(path: &Path) -> Option<String> {
22 let file = File::open(path).ok()?;
23 let mut reader = BufReader::with_capacity(READ_CHUNK, file);
24 let mut hasher = Sha256::new();
25 let mut buf = vec![0u8; READ_CHUNK];
26 loop {
27 let n = reader.read(&mut buf).ok()?;
28 if n == 0 {
29 break;
30 }
31 hasher.update(&buf[..n]);
32 }
33 Some(format!("{:x}", hasher.finalize()))
34}
35
36#[derive(Debug, Clone, Serialize)]
37pub struct ContentDupPath {
38 pub path: String,
39 pub kind: String,
41}
42
43#[derive(Debug, Serialize)]
44pub struct ContentDupGroup {
45 pub hash_hex: String,
46 pub size_bytes: u64,
47 pub paths: Vec<ContentDupPath>,
48}
49
50#[derive(Debug, Serialize)]
51pub struct ContentDupScanResult {
52 pub groups: Vec<ContentDupGroup>,
53 pub files_hashed: usize,
55 pub skipped: usize,
57}
58
59pub fn find_byte_duplicate_groups(
61 entries: Vec<(String, u64, String)>,
62 progress: Option<(Arc<AppHandle>, usize)>,
63) -> ContentDupScanResult {
64 let mut size_map: HashMap<u64, Vec<(String, String)>> = HashMap::new();
65 for (path, sz, kind) in entries {
66 size_map.entry(sz).or_default().push((path, kind));
67 }
68
69 let mut to_hash: Vec<(String, String, u64)> = Vec::new();
70 for (sz, paths) in size_map {
71 if paths.len() < 2 {
72 continue;
73 }
74 for (p, k) in paths {
75 to_hash.push((p, k, sz));
76 }
77 }
78
79 let total = to_hash.len();
80 if total == 0 {
81 return ContentDupScanResult {
82 groups: vec![],
83 files_hashed: 0,
84 skipped: 0,
85 };
86 }
87
88 let done_ctr = AtomicUsize::new(0);
89 let skipped_ctr = AtomicUsize::new(0);
90
91 let hashed: Vec<(String, String, u64, String)> = to_hash
92 .into_par_iter()
93 .filter_map(|(path, kind, sz)| {
94 let p = Path::new(&path);
95 let h = match hash_file_sha256(p) {
96 Some(x) => x,
97 None => {
98 skipped_ctr.fetch_add(1, Ordering::Relaxed);
99 return None;
100 }
101 };
102 if let Some((app, every)) = progress.as_ref() {
103 let n = done_ctr.fetch_add(1, Ordering::Relaxed) + 1;
104 if *every > 0 && (n % *every == 0 || n == total) {
105 let _ = app.emit(
106 "content-dup-progress",
107 serde_json::json!({ "done": n, "total": total }),
108 );
109 }
110 } else {
111 done_ctr.fetch_add(1, Ordering::Relaxed);
112 }
113 Some((path, kind, sz, h))
114 })
115 .collect();
116
117 let mut by_hash: HashMap<String, Vec<(String, String, u64)>> = HashMap::new();
118 for (path, kind, sz, h) in hashed {
119 by_hash.entry(h).or_default().push((path, kind, sz));
120 }
121
122 let mut groups: Vec<ContentDupGroup> = by_hash
123 .into_iter()
124 .filter(|(_, paths)| paths.len() > 1)
125 .map(|(hash_hex, mut paths)| {
126 paths.sort_by(|a, b| a.0.cmp(&b.0));
127 let size_bytes = paths[0].2;
128 let paths = paths
129 .into_iter()
130 .map(|(path, kind, _)| ContentDupPath { path, kind })
131 .collect();
132 ContentDupGroup {
133 hash_hex,
134 size_bytes,
135 paths,
136 }
137 })
138 .collect();
139
140 groups.sort_by(|a, b| a.hash_hex.cmp(&b.hash_hex));
141
142 ContentDupScanResult {
143 files_hashed: total,
144 skipped: skipped_ctr.into_inner(),
145 groups,
146 }
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns a scratch directory unique to this process and test name, creating it
    /// on a best-effort basis.
    fn test_dir(name: &str) -> std::path::PathBuf {
        let p = std::env::temp_dir().join(format!(
            "ah_content_hash_{}_{}",
            std::process::id(),
            name
        ));
        let _ = std::fs::create_dir_all(&p);
        p
    }

    #[test]
    fn identical_files_same_hash() {
        let dir = test_dir("same");
        let a = dir.join("a.bin");
        let b = dir.join("b.bin");
        std::fs::write(&a, b"hello").unwrap();
        std::fs::write(&b, b"hello").unwrap();
        let hash_a = hash_file_sha256(&a);
        let hash_b = hash_file_sha256(&b);
        // Clean up before asserting so a failure does not leave files behind.
        let _ = std::fs::remove_dir_all(&dir);

        // Pin the actual digest: comparing two `Option`s alone would also pass if
        // both hashes failed and came back as `None`.
        assert_eq!(
            hash_a.as_deref(),
            Some("2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"),
            "SHA-256 of b\"hello\""
        );
        assert_eq!(hash_a, hash_b, "same bytes => same SHA-256");
    }

    #[test]
    fn find_groups_two_identical() {
        let dir = test_dir("dup");
        let a = dir.join("a.wav");
        let b = dir.join("b.wav");
        std::fs::write(&a, b"x").unwrap();
        std::fs::write(&b, b"x").unwrap();
        let entries = vec![
            (a.to_string_lossy().into_owned(), 1, "audio".into()),
            (b.to_string_lossy().into_owned(), 1, "audio".into()),
        ];
        let r = find_byte_duplicate_groups(entries, None);
        let _ = std::fs::remove_dir_all(&dir);

        assert_eq!(r.groups.len(), 1);
        assert_eq!(r.groups[0].paths.len(), 2);
        assert_eq!(r.groups[0].size_bytes, 1);
        assert_eq!(r.files_hashed, 2);
        assert_eq!(r.skipped, 0);
    }

    #[test]
    fn same_size_different_content_is_not_a_group() {
        let dir = test_dir("diff");
        let a = dir.join("a.wav");
        let b = dir.join("b.wav");
        std::fs::write(&a, b"x").unwrap();
        std::fs::write(&b, b"y").unwrap();
        let entries = vec![
            (a.to_string_lossy().into_owned(), 1, "audio".into()),
            (b.to_string_lossy().into_owned(), 1, "audio".into()),
        ];
        let r = find_byte_duplicate_groups(entries, None);
        let _ = std::fs::remove_dir_all(&dir);

        assert!(r.groups.is_empty());
        assert_eq!(r.skipped, 0);
    }

    #[test]
    fn unreadable_files_are_skipped() {
        let dir = test_dir("missing");
        let entries = vec![
            (
                dir.join("nope_a.bin").to_string_lossy().into_owned(),
                1,
                "audio".into(),
            ),
            (
                dir.join("nope_b.bin").to_string_lossy().into_owned(),
                1,
                "audio".into(),
            ),
        ];
        let r = find_byte_duplicate_groups(entries, None);
        let _ = std::fs::remove_dir_all(&dir);

        assert!(r.groups.is_empty());
        assert_eq!(r.skipped, 2);
    }
}