{dplyr} functions may be slower than {base} variants
bedantaguru opened this issue · 6 comments
bedantaguru commented
I think a few of them can be moved to base R without changing much of the codebase, for example (the base idioms are sketched below the list):
- select
- rename
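As a rough illustration, these map onto plain base idioms (a sketch only, not the actual replacement code):

df <- data.frame(a = 1:3, b = 4:6, c = 7:9)

# select: character indexing keeps only the named columns
df[c("a", "b")]

# rename: assign into names() at the matching position
names(df)[names(df) == "a"] <- "a_new"
df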
bedantaguru commented
Performance results for select and rename:
library(magrittr)

# NSE select: capture the ... expressions, then use plain character indexing;
# a leading "-" drops a column and named arguments double as renames
select_base_nse <- function(data, ...) {
  el <- rlang::exprs(...)
  if (length(el) > 0) {
    sels <- as.character(el)
    if (any(stringr::str_detect(sels, "-"))) {
      rems <- stringr::str_remove(sels, "-") %>% stringr::str_trim()
      data <- data[setdiff(colnames(data), rems)]
    } else {
      data <- data[as.character(el)]
      nms <- names(el)
      if (!is.null(nms)) {
        nms <- nms[nchar(nms) > 0]
        eln <- el[nms]
        if (length(eln) > 0) {
          data <- rename_base(data, new_names = nms, old_names = as.character(eln))
        }
      }
    }
  }
  data
}

# rename by assigning into colnames() at the matching positions
rename_base <- function(data, old_names, new_names) {
  cn <- colnames(data)
  cnt <- seq_along(cn)
  names(cnt) <- cn
  cn[cnt[old_names]] <- new_names
  colnames(data) <- cn
  data
}

# NSE wrapper around rename_base(): names of ... are the new names, values the old ones
rename_base_nse <- function(data, ...) {
  el <- rlang::exprs(...)
  if (length(el) > 0) {
    rns <- names(el)
    ons <- as.character(el)
    data <- rename_base(data, new_names = rns, old_names = ons)
  }
  data
}

microbenchmark::microbenchmark(
  iris[c("Sepal.Length", "Sepal.Width")],
  select_base_nse(iris, Sepal.Length, Sepal.Width),
  dplyr::select(iris, Sepal.Length, Sepal.Width)
)
#> Unit: microseconds
#> expr min lq
#> iris[c("Sepal.Length", "Sepal.Width")] 19.245 29.0815
#> select_base_nse(iris, Sepal.Length, Sepal.Width) 107.342 151.1770
#> dplyr::select(iris, Sepal.Length, Sepal.Width) 3164.217 3782.3915
#> mean median uq max neval
#> 46.03761 34.8545 45.3320 563.221 100
#> 536.62575 185.3885 250.8195 21389.570 100
#> 11904.59006 4771.9840 5913.3940 687420.029 100
microbenchmark::microbenchmark(
  iris[c("Sepal.Length", "Sepal.Width")] %>%
    rename_base(old_names = "Sepal.Width", new_names = "tst"),
  select_base_nse(iris, Sepal.Length, tst = Sepal.Width),
  dplyr::select(iris, Sepal.Length, tst = Sepal.Width)
)
#> Unit: microseconds
#> expr
#> iris[c("Sepal.Length", "Sepal.Width")] %>% rename_base(old_names = "Sepal.Width", new_names = "tst")
#> select_base_nse(iris, Sepal.Length, tst = Sepal.Width)
#> dplyr::select(iris, Sepal.Length, tst = Sepal.Width)
#> min lq mean median uq max neval
#> 146.686 203.992 282.3938 234.5685 306.6285 1394.582 100
#> 113.329 165.503 256.7217 197.5770 233.0720 4912.468 100
#> 2988.879 3273.697 4284.6715 3810.1890 4915.2480 11775.896 100
microbenchmark::microbenchmark(
  rename_base(iris, old_names = "Sepal.Width", new_names = "tst"),
  rename_base_nse(iris, tst = Sepal.Width),
  dplyr::rename(iris, tst = Sepal.Width)
)
#> Unit: microseconds
#> expr min
#> rename_base(iris, old_names = "Sepal.Width", new_names = "tst") 8.126
#> rename_base_nse(iris, tst = Sepal.Width) 26.515
#> dplyr::rename(iris, tst = Sepal.Width) 1886.813
#> lq mean median uq max neval
#> 11.1195 19.46322 18.176 23.950 75.696 100
#> 44.6905 139.76679 56.879 73.771 7528.003 100
#> 2099.1435 2680.21856 2405.771 2993.796 7425.366 100
Created on 2020-04-01 by the reprex package (v0.3.0)
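One caveat with rename_base() above: an old_names entry that is not an existing column makes cnt[old_names] return NA, so the assignment fails with an uninformative subscript error. A minimal sketch of a stricter variant (rename_base_strict is a name I made up, not part of the codebase):

rename_base_strict <- function(data, old_names, new_names) {
  idx <- match(old_names, colnames(data))
  if (anyNA(idx)) {
    stop("unknown column(s): ", paste(old_names[is.na(idx)], collapse = ", "))
  }
  colnames(data)[idx] <- new_names
  data
}

rename_base_strict(iris, "Sepal.Width", "tst")    # same result as rename_base()
# rename_base_strict(iris, "Sepal.Widht", "tst")  # clear error instead of a subscript error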
bedantaguru commented
group_by() followed by summarise() is slower than aggregate(), but it has features that cannot easily be replaced with base (one such case is sketched after the reprex below).
suppressPackageStartupMessages(library(dplyr))

microbenchmark::microbenchmark(
  iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)),
  aggregate(iris["Petal.Width"], by = iris["Species"], mean)
)
#> Unit: microseconds
#> expr min
#> iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)) 5753.665
#> aggregate(iris["Petal.Width"], by = iris["Species"], mean) 711.617
#> lq mean median uq max neval
#> 6590.798 8429.525 7926.363 8722.229 45395.958 100
#> 878.403 1171.984 1016.535 1310.548 5534.706 100
Created on 2020-04-01 by the reprex package (v0.3.0)
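One example of what I mean (a sketch only; aggregate() would need a custom function returning several values, or several separate calls, to get the same result):

iris %>%
  group_by(Species) %>%
  summarise(m = mean(Petal.Width), s = sd(Petal.Width), n = n())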
bedantaguru commented
Test for inner_join
suppressPackageStartupMessages(library(dplyr))
inner_join_base <- function(x, y, by = NULL, suffix = c(".x", ".y")) {
  # default to the common columns, like dplyr::inner_join()
  if (is.null(by)) {
    by <- intersect(colnames(x), colnames(y))
  }
  # a named `by` (c("a" = "b")) maps names to by.x and values to by.y
  nmd <- !is.null(names(by))
  if (nmd) {
    merge(x, y, by.x = names(by), by.y = as.character(by), all = FALSE, suffixes = suffix)
  } else {
    merge(x, y, by = by, all = FALSE, suffixes = suffix)
  }
}

N <- 1e3
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])
d2 <- tibble(x = rbinom(N, 20, runif(N) / 2), LTbig = LETTERS[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  inner_join(d1, d2, by = "x"),
  inner_join_base(d1, d2, by = "x")
)
#> Unit: milliseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 5.779751 6.715674 9.149584 7.771339
#> inner_join_base(d1, d2, by = "x") 107.614133 121.283896 137.676605 132.975117
#> uq max neval
#> 9.656868 39.93566 100
#> 145.775458 218.41258 100
N <- 1e1
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])
d2 <- tibble(x = rbinom(N, 20, runif(N) / 2), LTbig = LETTERS[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  inner_join(d1, d2, by = "x"),
  inner_join_base(d1, d2, by = "x")
)
#> Unit: microseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 4712.326 5212.467 6553.3317 5834.278
#> inner_join_base(d1, d2, by = "x") 599.572 711.618 901.5904 808.482
#> uq max neval
#> 7069.130 22728.984 100
#> 1042.195 3578.614 100
Created on 2020-04-01 by the reprex package (v0.3.0)
On larger data (roughly n > 300 in my tests), dplyr::inner_join() is faster than the merge()-based version; below that, base merge() wins.
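For reference, the named-by branch above mirrors dplyr's by = c("x" = "y") syntax; a hypothetical usage with made-up tables:

a <- data.frame(id = 1:3, v = c("p", "q", "r"))
b <- data.frame(key = 2:4, w = c(10, 20, 30))

inner_join_base(a, b, by = c("id" = "key"))  # dispatches to merge(by.x = "id", by.y = "key")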
bedantaguru commented
Test for filter
library(dplyr, quietly = T)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
N <- 1e4
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = TRUE)])

microbenchmark::microbenchmark(
  d1 %>% filter(x > 5, lt < "t"),
  d1[d1$x > 5 & d1$lt < "t", ]
)
#> Unit: milliseconds
#> expr min lq mean median uq
#> d1 %>% filter(x > 5, lt < "t") 8.783170 9.319876 11.468130 9.897210 13.040256
#> d1[d1$x > 5 & d1$lt < "t", ] 4.828221 5.028363 6.140927 5.166494 6.557656
#> max neval
#> 24.62050 100
#> 13.52971 100
Created on 2020-04-03 by the reprex package (v0.3.0)
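For completeness, base also has an NSE-style filter in subset() (not benchmarked above; note that ?subset warns it is intended for interactive use):

subset(d1, x > 5 & lt < "t")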
bedantaguru commented
See this SO post.