Low CPU utilization during image hash comparison

Hi, was just wondering why is the CPU utilization low during image hash comparison phase of the scan? Every other phase has some obvious bottleneck (disk/cpu at max), but this one confuses me. How is the comparison implemented? Great work btw!

For now the main bottleneck is that algorithm works only on one thread

Code -

czkawka/czkawka_core/src/similar_images.rs

Lines 718 to 836 in c88d347

    
           if similarity >= 1 { 
        
               if self.fast_comparing { 
        
                   this_time_check_hashes = all_hashes_to_check.clone(); 
        
                   if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { 
        
                       // End thread which send info to gui 
        
                       progress_thread_run.store(false, Ordering::Relaxed); 
        
                       progress_thread_handle.join().unwrap(); 
        
                       return false; 
        
                   } 
        
                   for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() { 
        
                       atomic_mode_counter.fetch_add(1, Ordering::Relaxed); 
        
                       // It is not available, because in same iteration, was already taken out 
        
                       if !all_hashes_to_check.contains_key(&hash) { 
        
                           continue; 
        
                       } 
        
                       // Finds hashes with specific distance to original one 
        
                       let vector_with_found_similar_hashes = self 
        
                           .bktree 
        
                           .find(&hash, similarity) 
        
                           .filter(|(similarity, hash)| *similarity != 0 && available_hashes.contains_key(*hash)) 
        
                           .collect::<Vec<_>>(); 
        
                       // Not found any hash with specific distance 
        
                       if vector_with_found_similar_hashes.is_empty() { 
        
                           continue; 
        
                       } 
        
                       // Current checked hash isn't in any group of similarity, so we create one, because found similar images 
        
                       if !master_of_group.contains(&hash) { 
        
                           master_of_group.insert(hash.clone()); 
        
                           collected_similar_images.insert(hash.clone(), Vec::new()); 
        
                           let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master 
        
                           collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry); 
        
                           // This shouldn't be executed too much times, so it should be quite fast to check this 
        
                           if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { 
        
                               // End thread which send info to gui 
        
                               progress_thread_run.store(false, Ordering::Relaxed); 
        
                               progress_thread_handle.join().unwrap(); 
        
                               return false; 
        
                           } 
        
                       } 
        
                       vector_with_found_similar_hashes.iter().for_each(|(similarity, other_hash)| { 
        
                           let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record 
        
                           let mut vec_fe = available_hashes.remove(*other_hash).unwrap(); 
        
                           for fe in &mut vec_fe { 
        
                               fe.similarity = Similarity::Similar(*similarity) 
        
                           } 
        
                           collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe); 
        
                       }); 
        
                   } 
        
               } else { 
        
                   for current_similarity in 1..=similarity { 
        
                       this_time_check_hashes = all_hashes_to_check.clone(); 
        
                       if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { 
        
                           // End thread which send info to gui 
        
                           progress_thread_run.store(false, Ordering::Relaxed); 
        
                           progress_thread_handle.join().unwrap(); 
        
                           return false; 
        
                       } 
        
                       for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() { 
        
                           atomic_mode_counter.fetch_add(1, Ordering::Relaxed); 
        
                           // It is not available, because in same iteration, was already taken out 
        
                           if !all_hashes_to_check.contains_key(&hash) { 
        
                               continue; 
        
                           } 
        
                           // Finds hashes with specific distance to original one 
        
                           let vector_with_found_similar_hashes = self 
        
                               .bktree 
        
                               .find(&hash, similarity) 
        
                               .filter(|(similarity, hash)| (*similarity == current_similarity) && available_hashes.contains_key(*hash)) 
        
                               .collect::<Vec<_>>(); 
        
                           // Not found any hash with specific distance 
        
                           if vector_with_found_similar_hashes.is_empty() { 
        
                               continue; 
        
                           } 
        
                           // Current checked hash isn't in any group of similarity, so we create one, because found similar images 
        
                           if !master_of_group.contains(&hash) { 
        
                               master_of_group.insert(hash.clone()); 
        
                               collected_similar_images.insert(hash.clone(), Vec::new()); 
        
                               let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master 
        
                               collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry); 
        
                               // This shouldn't be executed too much times, so it should be quite fast to check this 
        
                               if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { 
        
                                   // End thread which send info to gui 
        
                                   progress_thread_run.store(false, Ordering::Relaxed); 
        
                                   progress_thread_handle.join().unwrap(); 
        
                                   return false; 
        
                               } 
        
                           } 
        
                           vector_with_found_similar_hashes.iter().for_each(|(similarity, other_hash)| { 
        
                               let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record 
        
                               let mut vec_fe = available_hashes.remove(*other_hash).unwrap(); 
        
                               for fe in &mut vec_fe { 
        
                                   fe.similarity = Similarity::Similar(*similarity) 
        
                               } 
        
                               collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe); 
        
                           }); 
        
                       } 
        
                   } 
        
               } 
        
           }

Adding multithreading to it is not so easy, because finding objects is related to list of excluded items, that was modified in previous iterations, so simple and fast solutions of dividing tasks to 4,8 or 16 parts are not available here.

But even with support of multiple threads, it would be still quite slow because with big number of hashes, each hash needs to be compared with every else multiple times.

	if similarity >= 1 {
	if self.fast_comparing {
	this_time_check_hashes = all_hashes_to_check.clone();

	if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
	// End thread which send info to gui
	progress_thread_run.store(false, Ordering::Relaxed);
	progress_thread_handle.join().unwrap();
	return false;
	}

	for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() {
	atomic_mode_counter.fetch_add(1, Ordering::Relaxed);

	// It is not available, because in same iteration, was already taken out
	if !all_hashes_to_check.contains_key(&hash) {
	continue;
	}

	// Finds hashes with specific distance to original one
	let vector_with_found_similar_hashes = self
	.bktree
	.find(&hash, similarity)
	.filter(\|(similarity, hash)\| similarity != 0 && available_hashes.contains_key(hash))
	.collect::<Vec<_>>();

	// Not found any hash with specific distance
	if vector_with_found_similar_hashes.is_empty() {
	continue;
	}

	// Current checked hash isn't in any group of similarity, so we create one, because found similar images
	if !master_of_group.contains(&hash) {
	master_of_group.insert(hash.clone());
	collected_similar_images.insert(hash.clone(), Vec::new());
	let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master

	collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry);

	// This shouldn't be executed too much times, so it should be quite fast to check this
	if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
	// End thread which send info to gui
	progress_thread_run.store(false, Ordering::Relaxed);
	progress_thread_handle.join().unwrap();
	return false;
	}
	}

	vector_with_found_similar_hashes.iter().for_each(\|(similarity, other_hash)\| {
	let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record
	let mut vec_fe = available_hashes.remove(*other_hash).unwrap();
	for fe in &mut vec_fe {
	fe.similarity = Similarity::Similar(*similarity)
	}

	collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe);
	});
	}
	} else {
	for current_similarity in 1..=similarity {
	this_time_check_hashes = all_hashes_to_check.clone();

	if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
	// End thread which send info to gui
	progress_thread_run.store(false, Ordering::Relaxed);
	progress_thread_handle.join().unwrap();
	return false;
	}

	for (hash, mut vec_file_entry) in this_time_check_hashes.into_iter() {
	atomic_mode_counter.fetch_add(1, Ordering::Relaxed);

	// It is not available, because in same iteration, was already taken out
	if !all_hashes_to_check.contains_key(&hash) {
	continue;
	}

	// Finds hashes with specific distance to original one
	let vector_with_found_similar_hashes = self
	.bktree
	.find(&hash, similarity)
	.filter(\|(similarity, hash)\| (similarity == current_similarity) && available_hashes.contains_key(hash))
	.collect::<Vec<_>>();

	// Not found any hash with specific distance
	if vector_with_found_similar_hashes.is_empty() {
	continue;
	}

	// Current checked hash isn't in any group of similarity, so we create one, because found similar images
	if !master_of_group.contains(&hash) {
	master_of_group.insert(hash.clone());
	collected_similar_images.insert(hash.clone(), Vec::new());
	let _ = available_hashes.remove(&hash); // Cannot be used anymore as non master

	collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_file_entry);

	// This shouldn't be executed too much times, so it should be quite fast to check this
	if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
	// End thread which send info to gui
	progress_thread_run.store(false, Ordering::Relaxed);
	progress_thread_handle.join().unwrap();
	return false;
	}
	}

	vector_with_found_similar_hashes.iter().for_each(\|(similarity, other_hash)\| {
	let _ = all_hashes_to_check.remove(*other_hash); // Cannot be used anymore as master record
	let mut vec_fe = available_hashes.remove(*other_hash).unwrap();
	for fe in &mut vec_fe {
	fe.similarity = Similarity::Similar(*similarity)
	}

	collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe);
	});
	}
	}
	}
	}