vigna/webgraph-rs

LLP: Failure to serialize labels causes a segfault

Opened this issue · 5 comments

For some reason, when we early-return from this line (e.g. because we ran out of disk space in the temp dir):

.context("Could not serialize labels")?;

then dropping the LabelStore segfaults.

For example, with this patch:

diff --git a/src/algo/llp/mod.rs b/src/algo/llp/mod.rs
index 2b5f514..6f14043 100644
--- a/src/algo/llp/mod.rs
+++ b/src/algo/llp/mod.rs
@@ -43,6 +43,7 @@ use rand::SeedableRng;
 use rayon::prelude::*;
 use std::collections::HashMap;
 use std::env::temp_dir;
+use std::mem::ManuallyDrop;
 use std::path::PathBuf;
 use std::sync::atomic::Ordering;
 use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize};
@@ -152,6 +153,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
             .iter()
             .for_each(|x| x.store(true, Ordering::Relaxed));
 
+        /*
         for update in 0.. {
             update_pl.start(format!("Starting update {}...", update));
 
@@ -270,6 +272,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
                 break;
             }
         }
+        */
 
         iter_pl.done();
 
@@ -295,11 +298,16 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
         costs.push(cost);
 
         // storing the perms
+        let path = labels_path(gamma_index);
+        info!("Creating {}", path.display());
         let mut file =
-            std::fs::File::create(labels_path(gamma_index)).context("Could not write labels")?;
-        labels
-            .serialize(&mut file)
-            .context("Could not serialize labels")?;
+            std::fs::File::create(&path).context("Could not write labels")?;
+        info!("Writing {}", path.display());
+        let res = labels
+            .serialize(&mut file);
+        info!("Res {:?}", res);
+        res.context("Could not serialize labels")?;
+        info!("Done writing {}", path.display());
 
         gamma_pl.update_and_display();
     }
diff --git a/src/cli/llp.rs b/src/cli/llp.rs
index 99c0151..d77a809 100644
--- a/src/cli/llp.rs
+++ b/src/cli/llp.rs
@@ -85,7 +85,7 @@ pub fn cli(command: Command) -> Command {
 pub fn main(submatches: &ArgMatches) -> Result<()> {
     let args = CliArgs::from_arg_matches(submatches)?;
 
-    match get_endianness(&args.basename)?.as_str() {
+    let main_res = match get_endianness(&args.basename)?.as_str() {
         #[cfg(any(
             feature = "be_bins",
             not(any(feature = "be_bins", feature = "le_bins"))
@@ -97,7 +97,10 @@ pub fn main(submatches: &ArgMatches) -> Result<()> {
         ))]
         LE::NAME => llp_impl::<LE>(args),
         e => panic!("Unknown endianness: {}", e),
-    }
+    };
+
+    log::info!("main res {:?}", main_res);
+    main_res
 }
 
 fn llp_impl<E: Endianness + 'static + Send + Sync>(args: CliArgs) -> Result<()>
@@ -157,7 +160,7 @@ where
     }
 
     // compute the LLP
-    let labels = llp::layered_label_propagation(
+    let res2 = llp::layered_label_propagation(
         &graph,
         &*deg_cumul,
         gammas,
@@ -166,8 +169,10 @@ where
         args.granularity,
         args.seed,
         predicate,
-    )
-    .context("Could not compute the LLP")?;
+    );
+    log::info!("res2 {:?}", res2);
+    let labels = res2.context("Could not compute the LLP")?;
+    log::info!("labels ok");
 
     let mut llp_perm = (0..graph.num_nodes()).collect::<Vec<_>>();
     llp_perm.par_sort_by(|&a, &b| labels[a].cmp(&labels[b]));

llp prints:

[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Log-gap cost: 68596432338
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Creating /tmp/labels_0.bin
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Writing /tmp/labels_0.bin
[2024-03-23T11:33:48Z INFO  webgraph::algo::llp] Res Err(WriteError)
Segmentation fault

and here is the traceback:

Thread 1 "webgraph" received signal SIGSEGV, Segmentation fault.                                                                        
__GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                                  
3102    malloc.c: No such file or directory.                                                                                            
(gdb) bt                                                                                                                                
#0  __GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                              
#1  0x00005555556459fa in alloc::alloc::dealloc (ptr=<optimized out>, layout=...) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:117                                                                                                              
#2  alloc::alloc::{impl#1}::deallocate (ptr=..., layout=..., self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:254                                                                                                             
#3  alloc::boxed::{impl#8}::drop<[core::sync::atomic::AtomicUsize], alloc::alloc::Global> (self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1243                                                                              
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
#5  core::ptr::drop_in_place<webgraph::algo::llp::label_store::LabelStore> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                                                 
#6  webgraph::algo::llp::layered_label_propagation<webgraph::graphs::bvgraph::random_access::BVGraph<webgraph::graphs::bvgraph::codecs::dec_dyn::DynCodesDecoderFactory<dsi_bitstream::traits::endianness::BigEndian, webgraph::graphs::bvgraph::codecs::factories::MemoryFactory
<dsi_bitstream::traits::endianness::BigEndian, webgraph::utils::mmap_helper::MmapHelper<u32, mmap_rs::mmap::Mmap>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_fixed2::SelectFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_ve
c::BitFieldVec<usize, &[usize]>>>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_zero_fixed2::SelectZeroFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_vec::BitFieldVec<usize, &[usize]>>, predicates::boxed::BoxPredicate<webgr
aph::algo::llp::preds::PredParams>> (sym_graph=<optimized out>, deg_cumul=<optimized out>, gammas=..., num_threads=..., chunk_size=..., granularity=..., seed=0, predicate=...) at src/algo/llp/mod.rs:357                                                                       
#7  0x00005555556c6b4d in webgraph::cli::llp::llp_impl<dsi_bitstream::traits::endianness::BigEndian> (args=...) at src/cli/llp.rs:163   
#8  webgraph::cli::llp::main (submatches=<optimized out>) at src/cli/llp.rs:93                                                          
#9  0x00005555555e222c in webgraph::main () at src/main.rs:70                                                                           
(gdb) f 4                                                                                                                               
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs: No such file or directory.                         
(gdb) f                                                                                                                                 
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     in /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs                                                  

I assume this is due to the transmuted label_store.labels, but I don't see why the compiler would drop the transmuted value before the original, let alone drop it at all. Wrapping it in ManuallyDrop doesn't help.

This happens in both release and debug mode (I commented out the worker loop so it terminates within a reasonable time in debug mode).

I didn't try; this is on 1.76.0.

Hmm, I can't try on nightly because of a different issue (stack overflow despite setting ulimit -s 65533, which was enough on stable).

Hah, I forgot RUST_MIN_STACK=8388608. Yes, the segfault still happens on nightly.

Well, we need to get something much smaller and reproducible. But I have the gut feeling this is not gonna be easy.