LLP: Failure to serialize labels causes a segfault
Opened this issue · 5 comments
For some reason, when we early-return from this line (e.g., because we ran out of disk space in the temp dir):
webgraph-rs/src/algo/llp/mod.rs
Line 302 in 9cd0536
then dropping the `LabelStore` segfaults.
For example, with this patch:
diff --git a/src/algo/llp/mod.rs b/src/algo/llp/mod.rs
index 2b5f514..6f14043 100644
--- a/src/algo/llp/mod.rs
+++ b/src/algo/llp/mod.rs
@@ -43,6 +43,7 @@ use rand::SeedableRng;
use rayon::prelude::*;
use std::collections::HashMap;
use std::env::temp_dir;
+use std::mem::ManuallyDrop;
use std::path::PathBuf;
use std::sync::atomic::Ordering;
use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize};
@@ -152,6 +153,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
.iter()
.for_each(|x| x.store(true, Ordering::Relaxed));
+ /*
for update in 0.. {
update_pl.start(format!("Starting update {}...", update));
@@ -270,6 +272,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
break;
}
}
+ */
iter_pl.done();
@@ -295,11 +298,16 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
costs.push(cost);
// storing the perms
+ let path = labels_path(gamma_index);
+ info!("Creating {}", path.display());
let mut file =
- std::fs::File::create(labels_path(gamma_index)).context("Could not write labels")?;
- labels
- .serialize(&mut file)
- .context("Could not serialize labels")?;
+ std::fs::File::create(&path).context("Could not write labels")?;
+ info!("Writing {}", path.display());
+ let res = labels
+ .serialize(&mut file);
+ info!("Res {:?}", res);
+ res.context("Could not serialize labels")?;
+ info!("Done writing {}", path.display());
gamma_pl.update_and_display();
}
diff --git a/src/cli/llp.rs b/src/cli/llp.rs
index 99c0151..d77a809 100644
--- a/src/cli/llp.rs
+++ b/src/cli/llp.rs
@@ -85,7 +85,7 @@ pub fn cli(command: Command) -> Command {
pub fn main(submatches: &ArgMatches) -> Result<()> {
let args = CliArgs::from_arg_matches(submatches)?;
- match get_endianness(&args.basename)?.as_str() {
+ let main_res = match get_endianness(&args.basename)?.as_str() {
#[cfg(any(
feature = "be_bins",
not(any(feature = "be_bins", feature = "le_bins"))
@@ -97,7 +97,10 @@ pub fn main(submatches: &ArgMatches) -> Result<()> {
))]
LE::NAME => llp_impl::<LE>(args),
e => panic!("Unknown endianness: {}", e),
- }
+ };
+
+ log::info!("main res {:?}", main_res);
+ main_res
}
fn llp_impl<E: Endianness + 'static + Send + Sync>(args: CliArgs) -> Result<()>
@@ -157,7 +160,7 @@ where
}
// compute the LLP
- let labels = llp::layered_label_propagation(
+ let res2 = llp::layered_label_propagation(
&graph,
&*deg_cumul,
gammas,
@@ -166,8 +169,10 @@ where
args.granularity,
args.seed,
predicate,
- )
- .context("Could not compute the LLP")?;
+ );
+ log::info!("res2 {:?}", res2);
+ let labels = res2.context("Could not compute the LLP")?;
+ log::info!("labels ok");
let mut llp_perm = (0..graph.num_nodes()).collect::<Vec<_>>();
llp_perm.par_sort_by(|&a, &b| labels[a].cmp(&labels[b]));
llp prints:
[2024-03-23T11:33:45Z INFO webgraph::algo::llp] Log-gap cost: 68596432338
[2024-03-23T11:33:45Z INFO webgraph::algo::llp] Creating /tmp/labels_0.bin
[2024-03-23T11:33:45Z INFO webgraph::algo::llp] Writing /tmp/labels_0.bin
[2024-03-23T11:33:48Z INFO webgraph::algo::llp] Res Err(WriteError)
Segmentation fault
and here is the traceback:
Thread 1 "webgraph" received signal SIGSEGV, Segmentation fault.
__GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102
3102 malloc.c: No such file or directory.
(gdb) bt
#0 __GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102
#1 0x00005555556459fa in alloc::alloc::dealloc (ptr=<optimized out>, layout=...) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:117
#2 alloc::alloc::{impl#1}::deallocate (ptr=..., layout=..., self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:254
#3 alloc::boxed::{impl#8}::drop<[core::sync::atomic::AtomicUsize], alloc::alloc::Global> (self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1243
#4 core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507
#5 core::ptr::drop_in_place<webgraph::algo::llp::label_store::LabelStore> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507
#6 webgraph::algo::llp::layered_label_propagation<webgraph::graphs::bvgraph::random_access::BVGraph<webgraph::graphs::bvgraph::codecs::dec_dyn::DynCodesDecoderFactory<dsi_bitstream::traits::endianness::BigEndian, webgraph::graphs::bvgraph::codecs::factories::MemoryFactory
<dsi_bitstream::traits::endianness::BigEndian, webgraph::utils::mmap_helper::MmapHelper<u32, mmap_rs::mmap::Mmap>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_fixed2::SelectFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_ve
c::BitFieldVec<usize, &[usize]>>>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_zero_fixed2::SelectZeroFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_vec::BitFieldVec<usize, &[usize]>>, predicates::boxed::BoxPredicate<webgr
aph::algo::llp::preds::PredParams>> (sym_graph=<optimized out>, deg_cumul=<optimized out>, gammas=..., num_threads=..., chunk_size=..., granularity=..., seed=0, predicate=...) at src/algo/llp/mod.rs:357
#7 0x00005555556c6b4d in webgraph::cli::llp::llp_impl<dsi_bitstream::traits::endianness::BigEndian> (args=...) at src/cli/llp.rs:163
#8 webgraph::cli::llp::main (submatches=<optimized out>) at src/cli/llp.rs:93
#9 0x00005555555e222c in webgraph::main () at src/main.rs:70
(gdb) f 4
#4 core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507
507 /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs: No such file or directory.
(gdb) f
#4 core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507
507 in /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs
I assume this is due to the transmuted `label_store.labels`, but I don't see why the compiler would drop the transmuted value before the original, let alone drop it at all. Wrapping it in `ManuallyDrop` doesn't help.
This happens in both release and debug mode (I commented out the worker loop so it terminates within a reasonable time in debug mode).
I didn't try, this is on 1.76.0
Hmm, I can't try on nightly because of a different issue (a stack overflow despite setting `ulimit -s 65533`, which was enough on stable).
Hah, I forgot `RUST_MIN_STACK=8388608`. Yes, the segfault still happens on nightly.
Well, we need to get something much smaller and reproducible. But I have the gut feeling this is not gonna be easy.