Low CPU utilization during image hash comparison
Closed this issue · 1 comments
Hi, was just wondering why is the CPU utilization low during image hash comparison phase of the scan? Every other phase has some obvious bottleneck (disk/cpu at max), but this one confuses me. How is the comparison implemented? Great work btw!
For now the main bottleneck is that algorithm works only on one thread
Code -
czkawka/czkawka_core/src/similar_images.rs
Lines 718 to 836 in c88d347
if similarity >= 1 { | |
if self.fast_comparing { | |
this_time_check_hashes = all_hashes_to_check.clone(); | |
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { | |
// End thread which send info to gui | |
progress_thread_run.store(false, Ordering::Relaxed); | |
progress_thread_handle.join().unwrap(); | |
return false; | |
} | |
for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() { | |
atomic_mode_counter.fetch_add(1, Ordering::Relaxed); | |
// It is not available, because in same iteration, was already taken out | |
if !all_hashes_to_check.contains_key(&hash) { | |
continue; | |
} | |
// Finds hashes with specific distance to original one | |
let vector_with_found_similar_hashes = self | |
.bktree | |
.find(&hash, similarity) | |
.filter(|(similarity, hash)| *similarity != 0 && available_hashes.contains_key(*hash)) | |
.collect::<Vec<_>>(); | |
// Not found any hash with specific distance | |
if vector_with_found_similar_hashes.is_empty() { | |
continue; | |
} | |
// Current checked hash isn't in any group of similarity, so we create one, because found similar images | |
if !master_of_group.contains(&hash) { | |
master_of_group.insert(hash.clone()); | |
collected_similar_images.insert(hash.clone(), Vec::new()); | |
let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master | |
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry); | |
// This shouldn't be executed too much times, so it should be quite fast to check this | |
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { | |
// End thread which send info to gui | |
progress_thread_run.store(false, Ordering::Relaxed); | |
progress_thread_handle.join().unwrap(); | |
return false; | |
} | |
} | |
vector_with_found_similar_hashes.iter().for_each(|(similarity, other_hash)| { | |
let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record | |
let mut vec_fe = available_hashes.remove(*other_hash).unwrap(); | |
for fe in &mut vec_fe { | |
fe.similarity = Similarity::Similar(*similarity) | |
} | |
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe); | |
}); | |
} | |
} else { | |
for current_similarity in 1..=similarity { | |
this_time_check_hashes = all_hashes_to_check.clone(); | |
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { | |
// End thread which send info to gui | |
progress_thread_run.store(false, Ordering::Relaxed); | |
progress_thread_handle.join().unwrap(); | |
return false; | |
} | |
for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() { | |
atomic_mode_counter.fetch_add(1, Ordering::Relaxed); | |
// It is not available, because in same iteration, was already taken out | |
if !all_hashes_to_check.contains_key(&hash) { | |
continue; | |
} | |
// Finds hashes with specific distance to original one | |
let vector_with_found_similar_hashes = self | |
.bktree | |
.find(&hash, similarity) | |
.filter(|(similarity, hash)| (*similarity == current_similarity) && available_hashes.contains_key(*hash)) | |
.collect::<Vec<_>>(); | |
// Not found any hash with specific distance | |
if vector_with_found_similar_hashes.is_empty() { | |
continue; | |
} | |
// Current checked hash isn't in any group of similarity, so we create one, because found similar images | |
if !master_of_group.contains(&hash) { | |
master_of_group.insert(hash.clone()); | |
collected_similar_images.insert(hash.clone(), Vec::new()); | |
let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master | |
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry); | |
// This shouldn't be executed too much times, so it should be quite fast to check this | |
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { | |
// End thread which send info to gui | |
progress_thread_run.store(false, Ordering::Relaxed); | |
progress_thread_handle.join().unwrap(); | |
return false; | |
} | |
} | |
vector_with_found_similar_hashes.iter().for_each(|(similarity, other_hash)| { | |
let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record | |
let mut vec_fe = available_hashes.remove(*other_hash).unwrap(); | |
for fe in &mut vec_fe { | |
fe.similarity = Similarity::Similar(*similarity) | |
} | |
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe); | |
}); | |
} | |
} | |
} | |
} |
Adding multithreading to it is not so easy, because finding objects is related to list of excluded items, that was modified in previous iterations, so simple and fast solutions of dividing tasks to 4,8 or 16 parts are not available here.
But even with support of multiple threads, it would be still quite slow because with big number of hashes, each hash needs to be compared with every else multiple times.
- 👍1