parse_list() can become slow for very large data

To reproduce:

n <- 7
# n sets, each containing up to 1e6 distinct integers drawn from 1:1e7
s <- setNames(replicate(n, {
  unique(sample(1e7, size = 1e6, replace = TRUE))
}, simplify = FALSE), LETTERS[1:n])

system.time(test <- eulerr:::parse_list(s))

One solution would be to use memoisation, i.e. to store intersections you have already computed and reuse them; a minimal, self-contained sketch of the pattern follows.
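In its simplest form, memoisation keeps a cache keyed by the input and only calls the expensive function on a cache miss (slow_fn and memo_fn below are illustrative names, not part of eulerr):

slow_fn <- function(x) { Sys.sleep(0.5); sort(x) } # stand-in for an expensive call

memo_fn <- local({
  cache <- new.env(parent = emptyenv())
  function(x) {
    key <- paste(x, collapse = "&") # cache key built from the input
    if (!exists(key, envir = cache, inherits = FALSE))
      assign(key, slow_fn(x), envir = cache) # compute once, store
    get(key, envir = cache) # later calls hit the cache
  }
})

system.time(memo_fn(c(3, 1, 2))) # first call: ~0.5 s
system.time(memo_fn(c(3, 1, 2))) # second call: ~0 s, served from the cache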

Applying the same idea to parse_list():

parse_list2 <- function(combinations) {
  sets <- names(combinations)
  n <- length(sets)
  # logical matrix with one row per non-empty combination of the n sets
  id <- eulerr:::bit_indexr(n)
  rownames(id) <- apply(id, 1L, function(x) paste(sets[x], collapse = "&"))

  # cache of intersections computed so far; -1 marks "not computed yet"
  intersect_sets <- as.list(rep(-1, nrow(id)))
  names(intersect_sets) <- rownames(id)

  compute_intersect <- function(bool) {
    ind <- which(bool)
    nm <- paste(sets[ind], collapse = "&")
    if (identical(intersect_sets[[nm]], -1)) { # not computed yet
      if (length(ind) == 1) {
        intersect_sets[[nm]] <<- combinations[[ind]]
      } else {
        # intersect the first set ...
        bool[] <- FALSE; bool[ind[1]] <- TRUE
        part1 <- compute_intersect(bool)
        # ... with the (cached) intersection of the remaining sets
        bool[ind] <- TRUE; bool[ind[1]] <- FALSE
        part2 <- compute_intersect(bool)
        intersect_sets[[nm]] <<- intersect(part1, part2)
      }
    }
    intersect_sets[[nm]]
  }

  # iterate over rows with lapply() rather than apply(): apply() would
  # simplify its result to a matrix if all intersections happened to have
  # the same length, which would break lengths()
  res <- lapply(seq_len(nrow(id)), function(i) compute_intersect(id[i, ]))
  setNames(lengths(res), rownames(id))
}
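Note that the recursion always splits off the first set, so every sub-intersection ends up cached under its own name: the B&C intersection computed on the way to A&B&C, for example, is served from the cache when the B&C row itself is tabulated. With the cache in place, each multi-set combination costs a single intersect() call instead of being rebuilt from scratch.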

For large data and an increasing number of sets, the difference is substantial:

# using the same data s as in the reproduction above

system.time(test <- eulerr:::parse_list(s))
system.time(test2 <- parse_list2(s))
all.equal(test, test2)
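
For a less noisy comparison than one-off system.time() calls, the microbenchmark package could be used (a sketch, assuming the package is installed; timings will vary with hardware):

library(microbenchmark)
microbenchmark(
  original = eulerr:::parse_list(s),
  memoised = parse_list2(s),
  times = 3L # keep low; a single run is already slow for this input
)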