duckdblabs/duckplyr

Filter pushdown discrepancy between duckdb and duckplyr

Opened this issue · 0 comments

This issue is related to the discussion in this comment:
#145 (comment)

The following code demonstrates the difference in filter
pushdown between DuckDB and Duckplyr as shown by the explain()
function.

With Duckplyr, the filter does not appear to be pushed down to the
Parquet files.

library(duckdb)
#> Loading required package: DBI
library(duckplyr)
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.
#> 
#> Attaching package: 'duckplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(dbplyr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:dbplyr':
#> 
#>     ident, sql
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
# DuckDB ------------------------------------------------------------------

con <- dbConnect(duckdb())

dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
#> <duckdb_result af590 connection=7ce20 statement='INSTALL httpfs; LOAD httpfs;'>
dbSendQuery(con, "SET s3_region='auto';SET s3_endpoint='';")
#> <duckdb_result e70c0 connection=7ce20 statement='SET s3_region='auto';SET s3_endpoint='';'>
f_duckdb <- function() {
  df <- tbl(
    con,
    "read_parquet('s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet')"
  )

  df |>
    filter(total_amount > 0L) |>
    filter(!is.na(passenger_count)) |>
    mutate(tip_pct = 100 * tip_amount / total_amount) |>
    summarise(
      avg_tip_pct = median(tip_pct),
      n = n(),
      .by = passenger_count
    ) |>
    arrange(desc(passenger_count))
}

# Duckplyr ----------------------------------------------------------------

con <- duckplyr:::get_default_duckdb_connection()

dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
#> <duckdb_result bab60 connection=cd550 statement='INSTALL httpfs; LOAD httpfs;'>
dbSendQuery(con, "SET s3_region='auto';SET s3_endpoint='';")
#> <duckdb_result e8960 connection=cd550 statement='SET s3_region='auto';SET s3_endpoint='';'>
f_duckplyr <- function() {
  duckplyr::duckplyr_df_from_file(
    "s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet",
    "read_parquet",
    options = list(hive_partitioning = TRUE),
    class = class(tibble())
  ) |>
    filter(total_amount > 0L) |>
    filter(!is.na(passenger_count)) |>
    mutate(tip_pct = 100 * tip_amount / total_amount) |>
    summarise(
      avg_tip_pct = median(tip_pct),
      n = n(),
      .by = passenger_count
    ) |>
    arrange(desc(passenger_count))
}

explain(f_duckdb())
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.
#> <SQL>
#> SELECT
#>   passenger_count,
#>   PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tip_pct) AS avg_tip_pct,
#>   COUNT(*) AS n
#> FROM (
#>   SELECT q01.*, (100.0 * tip_amount) / total_amount AS tip_pct
#>   FROM (FROM read_parquet('s3://duckplyr-demo-taxi-data/taxi-data-2019-partitioned/*/*.parquet')) q01
#>   WHERE (total_amount > 0) AND (NOT((passenger_count IS NULL)))
#> ) q01
#> GROUP BY passenger_count
#> ORDER BY passenger_count DESC
#> 
#> <PLAN>
#> physical_plan
#> ┌───────────────────────────┐
#> │          ORDER_BY         │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          ORDERS:          │
#> │  q01.passenger_count DESC │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │       HASH_GROUP_BY       │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │             #0            │
#> │     quantile_cont(#1)     │
#> │        count_star()       │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │          tip_pct          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │          tip_pct          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │           FILTER          │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │  (NOT (passenger_count IS │
#> │           NULL))          │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │        EC: 3524697        │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │       READ_PARQUET        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │         tip_amount        │
#> │        total_amount       │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │ Filters: total_amount>0.0 │
#> │ AND total_amount IS N...  │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │        EC: 17623488       │
#> └───────────────────────────┘
explain(f_duckplyr())
#> ┌───────────────────────────┐
#> │          ORDER_BY         │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          ORDERS:          │
#> │           #3 ASC          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │        avg_tip_pct        │
#> │             n             │
#> │     -(passenger_count)    │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │        avg_tip_pct        │
#> │             n             │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │       HASH_GROUP_BY       │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │             #0            │
#> │         median(#1)        │
#> │        count_star()       │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │          tip_pct          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │          tip_pct          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │      passenger_count      │
#> │         tip_amount        │
#> │        total_amount       │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │           FILTER          │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │(r_base::>(total_amount, 0)│
#> │ AND (NOT ((passenger_count│
#> │   IS NULL) OR isnan(CAST  │
#> │(passenger_count AS DO...  │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │        EC: 17623488       │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │       READ_PARQUET        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │        total_amount       │
#> │      passenger_count      │
#> │         tip_amount        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │        EC: 88117440       │
#> └───────────────────────────┘

As a result, duckplyr is slightly slower compared to duckdb.

bench::mark(f_duckdb(), f_duckplyr(), check = FALSE)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 f_duckdb()      1.29s    1.29s     0.777     336KB     1.55
#> 2 f_duckplyr()    6.95s    6.95s     0.144     103KB     0

Created on 2024-05-21 with reprex v2.1.0

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.0 (2024-04-24)
#>  os       Linux Mint 21.3
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language en_CA:en
#>  collate  en_CA.UTF-8
#>  ctype    en_CA.UTF-8
#>  tz       America/Montreal
#>  date     2024-05-21
#>  pandoc   3.1.11 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/x86_64/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  bench         1.1.3   2023-05-04 [1] RSPM (R 4.4.0)
#>  blob          1.2.4   2023-03-17 [1] RSPM
#>  cli           3.6.2   2023-12-11 [1] RSPM
#>  collections   0.3.7   2023-01-05 [1] RSPM
#>  DBI         * 1.2.2   2024-02-16 [1] RSPM
#>  dbplyr      * 2.5.0   2024-03-19 [1] RSPM
#>  digest        0.6.35  2024-03-11 [1] RSPM
#>  dplyr       * 1.1.4   2023-11-17 [1] RSPM
#>  duckdb      * 0.10.2  2024-05-01 [1] CRAN (R 4.4.0)
#>  duckplyr    * 0.4.0   2024-05-21 [1] CRAN (R 4.4.0)
#>  evaluate      0.23    2023-11-01 [1] RSPM
#>  fansi         1.0.6   2023-12-08 [1] RSPM
#>  fastmap       1.2.0   2024-05-15 [1] CRAN (R 4.4.0)
#>  fs            1.6.4   2024-04-25 [1] CRAN (R 4.4.0)
#>  generics      0.1.3   2022-07-05 [1] RSPM
#>  glue          1.7.0   2024-01-09 [1] RSPM
#>  htmltools     0.5.8.1 2024-04-04 [1] RSPM
#>  knitr         1.46    2024-04-06 [1] RSPM
#>  lifecycle     1.0.4   2023-11-07 [1] RSPM
#>  magrittr      2.0.3   2022-03-30 [1] RSPM
#>  pillar        1.9.0   2023-03-22 [1] RSPM
#>  pkgconfig     2.0.3   2019-09-22 [1] RSPM
#>  profmem       0.6.0   2020-12-13 [1] RSPM (R 4.4.0)
#>  purrr         1.0.2   2023-08-10 [1] RSPM
#>  R.cache       0.16.0  2022-07-21 [1] RSPM
#>  R.methodsS3   1.8.2   2022-06-13 [1] RSPM
#>  R.oo          1.26.0  2024-01-24 [1] RSPM
#>  R.utils       2.12.3  2023-11-18 [1] RSPM
#>  R6            2.5.1   2021-08-19 [1] RSPM
#>  reprex        2.1.0   2024-01-11 [1] RSPM
#>  rlang         1.1.3   2024-01-10 [1] RSPM
#>  rmarkdown     2.27    2024-05-17 [1] RSPM (R 4.4.0)
#>  rstudioapi    0.16.0  2024-03-24 [1] RSPM
#>  sessioninfo   1.2.2   2021-12-06 [1] RSPM
#>  styler        1.10.3  2024-04-07 [1] RSPM
#>  tibble        3.2.1   2023-03-20 [1] RSPM
#>  tidyselect    1.2.1   2024-03-11 [1] RSPM
#>  utf8          1.2.4   2023-10-22 [1] RSPM
#>  vctrs         0.6.5   2023-12-01 [1] RSPM
#>  withr         3.0.0   2024-01-16 [1] RSPM
#>  xfun          0.44    2024-05-15 [1] CRAN (R 4.4.0)
#>  yaml          2.3.8   2023-12-11 [1] RSPM
#> 
#>  [1] /home/filoche/R/x86_64-pc-linux-gnu-library/4.4
#>  [2] /usr/local/lib/R/site-library
#>  [3] /usr/lib/R/site-library
#>  [4] /usr/lib/R/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────