parse_list() can become slow for very large data
Closed · 1 comment
privefl commented
To reproduce:
n <- 7
s <- setNames(replicate(n, {
  unique(sample(1e7, size = 1e6, replace = TRUE))
}, simplify = FALSE), LETTERS[1:n])
system.time(test <- eulerr:::parse_list(s))
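For scale: with n = 7 there are 2^7 - 1 = 127 subset intersections to evaluate, each over sets of roughly a million elements. The cost profile is consistent with every subset being intersected from scratch, as in this naive sketch (an illustration of the problem, not eulerr's actual implementation):
naive_intersection_sizes <- function(combinations) {
  id <- eulerr:::bit_indexr(length(combinations))
  # recompute every subset's intersection from scratch
  apply(id, 1, function(bool) length(Reduce(intersect, combinations[bool])))
}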
privefl commented
One solution would be to use memoisation, i.e. to store intersections you have already computed and reuse them. For example:
parse_list2 <- function(combinations) {
  sets <- names(combinations)
  n <- length(sets)

  # one row per non-empty subset of the n sets
  id <- eulerr:::bit_indexr(n)
  rownames(id) <- apply(id, 1L, function(x) paste(sets[x], collapse = "&"))

  # cache of intersections; -1 is a sentinel for "not computed yet"
  intersect_sets <- as.list(rep(-1, nrow(id)))
  names(intersect_sets) <- rownames(id)

  compute_intersect <- function(bool) {
    ind <- which(bool)
    nm <- paste(sets[ind], collapse = "&")
    if (identical(intersect_sets[[nm]], -1)) { # not computed yet
      if (length(ind) == 1) {
        intersect_sets[[nm]] <<- combinations[[ind]]
      } else {
        # split into the first set and the rest; both parts hit the cache
        bool[] <- FALSE; bool[ind[1]] <- TRUE
        part1 <- compute_intersect(bool)
        bool[ind] <- TRUE; bool[ind[1]] <- FALSE
        part2 <- compute_intersect(bool)
        intersect_sets[[nm]] <<- intersect(part1, part2)
      }
    }
    intersect_sets[[nm]]
  }

  # sizes of all intersections, named by subset
  lengths(apply(id, 1, compute_intersect))
}
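parse_list2() relies on eulerr:::bit_indexr(n), which is assumed here to return a logical matrix with one row per non-empty subset of the n sets. A rough base-R equivalent, for illustration only (the exact row order does not matter, since the cache is keyed by subset name):
bit_indexr_sketch <- function(n) {
  # one row per non-empty subset, grouped by subset size
  do.call(rbind, lapply(seq_len(n), function(k) {
    t(apply(combn(n, k), 2, function(ix) seq_len(n) %in% ix))
  }))
}
With the cache, each subset's intersection is computed exactly once: the number of pairwise intersect() calls drops from n * 2^(n - 1) - (2^n - 1) in the from-scratch approach to 2^n - 1 - n (321 versus 120 for n = 7), and results computed for smaller subsets are reused by every larger subset that contains them.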
For large data and an increasing number of sets, it matters a lot:
n <- 7
s <- setNames(replicate(n, {
  unique(sample(1e7, size = 1e6, replace = TRUE))
}, simplify = FALSE), LETTERS[1:n])
system.time(test <- eulerr:::parse_list(s))
system.time(test2 <- parse_list2(s))
all.equal(test, test2)
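A more generic variant of the same idea uses the memoise package instead of a hand-rolled cache. A minimal sketch, assuming memoise is available (make_memo_intersect is a hypothetical helper, not part of eulerr); keeping combinations in the closure means only the small integer key vector is hashed on each call:
make_memo_intersect <- function(combinations) {
  # `f` is bound to the memoised closure, so the recursive calls
  # below also go through the cache
  f <- memoise::memoise(function(ind) {
    if (length(ind) == 1L) return(combinations[[ind]])
    # peel off the first index and intersect with the cached rest
    intersect(f(ind[1L]), f(ind[-1L]))
  })
  f
}
# usage: intersection sizes for every subset, as in parse_list2()
f <- make_memo_intersect(s)
id <- eulerr:::bit_indexr(length(s))
sizes <- apply(id, 1, function(bool) length(f(which(bool))))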