{dplyr} functions may be slower than {base} variants
bedantaguru opened this issue · 6 comments
bedantaguru commented
I think a few of them can be moved to base R without changing much of the codebase, for example (the base idioms are sketched below the list):
- select
- rename
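As a rough illustration, these map onto plain base idioms (a sketch only, not the actual replacement code):

df <- data.frame(a = 1:3, b = 4:6, c = 7:9)

# select: character indexing keeps only the named columns
df[c("a", "b")]

# rename: assign into names() at the matching position
names(df)[names(df) == "a"] <- "a_new"
df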
bedantaguru commented
Performance results for select and rename:
library(magrittr)

# NSE select: capture the ... expressions, then use plain character indexing;
# a leading "-" drops a column and named arguments double as renames
select_base_nse <- function(data, ...) {
  el <- rlang::exprs(...)
  if (length(el) > 0) {
    sels <- as.character(el)
    if (any(stringr::str_detect(sels, "-"))) {
      rems <- stringr::str_remove(sels, "-") %>% stringr::str_trim()
      data <- data[setdiff(colnames(data), rems)]
    } else {
      data <- data[as.character(el)]
      nms <- names(el)
      if (!is.null(nms)) {
        nms <- nms[nchar(nms) > 0]
        eln <- el[nms]
        if (length(eln) > 0) {
          data <- rename_base(data, new_names = nms, old_names = as.character(eln))
        }
      }
    }
  }
  data
}

# rename by assigning into colnames() at the matching positions
rename_base <- function(data, old_names, new_names) {
  cn <- colnames(data)
  cnt <- seq_along(cn)
  names(cnt) <- cn
  cn[cnt[old_names]] <- new_names
  colnames(data) <- cn
  data
}

# NSE wrapper around rename_base(): names of ... are the new names, values the old ones
rename_base_nse <- function(data, ...) {
  el <- rlang::exprs(...)
  if (length(el) > 0) {
    rns <- names(el)
    ons <- as.character(el)
    data <- rename_base(data, new_names = rns, old_names = ons)
  }
  data
}

microbenchmark::microbenchmark(
  iris[c("Sepal.Length", "Sepal.Width")],
  select_base_nse(iris, Sepal.Length, Sepal.Width),
  dplyr::select(iris, Sepal.Length, Sepal.Width)
)
#> Unit: microseconds
#> expr min lq
#> iris[c("Sepal.Length", "Sepal.Width")] 19.245 29.0815
#> select_base_nse(iris, Sepal.Length, Sepal.Width) 107.342 151.1770
#> dplyr::select(iris, Sepal.Length, Sepal.Width) 3164.217 3782.3915
#> mean median uq max neval
#> 46.03761 34.8545 45.3320 563.221 100
#> 536.62575 185.3885 250.8195 21389.570 100
#> 11904.59006 4771.9840 5913.3940 687420.029 100
microbenchmark::microbenchmark(
  iris[c("Sepal.Length", "Sepal.Width")] %>%
    rename_base(old_names = "Sepal.Width", new_names = "tst"),
  select_base_nse(iris, Sepal.Length, tst = Sepal.Width),
  dplyr::select(iris, Sepal.Length, tst = Sepal.Width)
)
#> Unit: microseconds
#> expr
#> iris[c("Sepal.Length", "Sepal.Width")] %>% rename_base(old_names = "Sepal.Width", new_names = "tst")
#> select_base_nse(iris, Sepal.Length, tst = Sepal.Width)
#> dplyr::select(iris, Sepal.Length, tst = Sepal.Width)
#> min lq mean median uq max neval
#> 146.686 203.992 282.3938 234.5685 306.6285 1394.582 100
#> 113.329 165.503 256.7217 197.5770 233.0720 4912.468 100
#> 2988.879 3273.697 4284.6715 3810.1890 4915.2480 11775.896 100
microbenchmark::microbenchmark(
  rename_base(iris, old_names = "Sepal.Width", new_names = "tst"),
  rename_base_nse(iris, tst = Sepal.Width),
  dplyr::rename(iris, tst = Sepal.Width)
)
#> Unit: microseconds
#> expr min
#> rename_base(iris, old_names = "Sepal.Width", new_names = "tst") 8.126
#> rename_base_nse(iris, tst = Sepal.Width) 26.515
#> dplyr::rename(iris, tst = Sepal.Width) 1886.813
#> lq mean median uq max neval
#> 11.1195 19.46322 18.176 23.950 75.696 100
#> 44.6905 139.76679 56.879 73.771 7528.003 100
#> 2099.1435 2680.21856 2405.771 2993.796 7425.366 100
Created on 2020-04-01 by the reprex package (v0.3.0)
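One caveat with rename_base() above: an old_names entry that is not an existing column makes cnt[old_names] return NA, so the assignment fails with an uninformative subscript error. A minimal sketch of a stricter variant (rename_base_strict is a name I made up, not part of the codebase):

rename_base_strict <- function(data, old_names, new_names) {
  idx <- match(old_names, colnames(data))
  if (anyNA(idx)) {
    stop("unknown column(s): ", paste(old_names[is.na(idx)], collapse = ", "))
  }
  colnames(data)[idx] <- new_names
  data
}

rename_base_strict(iris, "Sepal.Width", "tst")    # same result as rename_base()
# rename_base_strict(iris, "Sepal.Widht", "tst")  # clear error instead of a subscript error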
bedantaguru commented
group_by() followed by summarise() is slower than aggregate(), but it has features that cannot easily be replaced with base (one such case is sketched after the reprex below).
suppressPackageStartupMessages(library(dplyr))

microbenchmark::microbenchmark(
  iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)),
  aggregate(iris["Petal.Width"], by = iris["Species"], mean)
)
#> Unit: microseconds
#> expr min
#> iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)) 5753.665
#> aggregate(iris["Petal.Width"], by = iris["Species"], mean) 711.617
#> lq mean median uq max neval
#> 6590.798 8429.525 7926.363 8722.229 45395.958 100
#> 878.403 1171.984 1016.535 1310.548 5534.706 100
Created on 2020-04-01 by the reprex package (v0.3.0)
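One example of what I mean (a sketch only; aggregate() would need a custom function returning several values, or several separate calls, to get the same result):

iris %>%
  group_by(Species) %>%
  summarise(m = mean(Petal.Width), s = sd(Petal.Width), n = n())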
bedantaguru commented
Test for inner_join
suppressPackageStartupMessages(library(dplyr))
inner_join_base <- function(x, y, by = NULL, suffix = c(".x", ".y")) {
  # default to the common columns, like dplyr::inner_join()
  if (is.null(by)) {
    by <- intersect(colnames(x), colnames(y))
  }
  # a named `by` (c("a" = "b")) maps names to by.x and values to by.y
  nmd <- !is.null(names(by))
  if (nmd) {
    merge(x, y, by.x = names(by), by.y = as.character(by), all = FALSE, suffixes = suffix)
  } else {
    merge(x, y, by = by, all = FALSE, suffixes = suffix)
  }
}

N <- 1e3
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])
d2 <- tibble(x = rbinom(N, 20, runif(N) / 2), LTbig = LETTERS[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  inner_join(d1, d2, by = "x"),
  inner_join_base(d1, d2, by = "x")
)
#> Unit: milliseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 5.779751 6.715674 9.149584 7.771339
#> inner_join_base(d1, d2, by = "x") 107.614133 121.283896 137.676605 132.975117
#> uq max neval
#> 9.656868 39.93566 100
#> 145.775458 218.41258 100
N <- 1e1
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])
d2 <- tibble(x = rbinom(N, 20, runif(N) / 2), LTbig = LETTERS[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  inner_join(d1, d2, by = "x"),
  inner_join_base(d1, d2, by = "x")
)
#> Unit: microseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 4712.326 5212.467 6553.3317 5834.278
#> inner_join_base(d1, d2, by = "x") 599.572 711.618 901.5904 808.482
#> uq max neval
#> 7069.130 22728.984 100
#> 1042.195 3578.614 100
Created on 2020-04-01 by the reprex package (v0.3.0)
On larger data (roughly n > 300 in my tests), dplyr::inner_join() is faster than the merge()-based version; below that, base merge() wins.
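For reference, the named-by branch above mirrors dplyr's by = c("x" = "y") syntax; a hypothetical usage with made-up tables:

a <- data.frame(id = 1:3, v = c("p", "q", "r"))
b <- data.frame(key = 2:4, w = c(10, 20, 30))

inner_join_base(a, b, by = c("id" = "key"))  # dispatches to merge(by.x = "id", by.y = "key")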
bedantaguru commented
Test for filter
library(dplyr, quietly = T)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
N <- 1e4
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  d1 %>% filter(x > 5, lt < "t"),
  d1[d1$x > 5 & d1$lt < "t", ]
)
#> Unit: milliseconds
#> expr min lq mean median uq
#> d1 %>% filter(x > 5, lt < "t") 8.783170 9.319876 11.468130 9.897210 13.040256
#> d1[d1$x > 5 & d1$lt < "t", ] 4.828221 5.028363 6.140927 5.166494 6.557656
#> max neval
#> 24.62050 100
#> 13.52971 100
Created on 2020-04-03 by the reprex package (v0.3.0)
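For completeness, base also has an NSE-style filter in subset() (not benchmarked above; note that ?subset warns it is intended for interactive use):

subset(d1, x > 5 & lt < "t")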
bedantaguru commented
See this SO post.