tidyverse/duckplyr

Implement `sum(na.rm = FALSE)`

Opened this issue · 3 comments

Skip to: #130 (comment)

library(tibble)
suppressPackageStartupMessages(library(duckplyr))

df1 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(1638210122L, 1456061134L, 1571660787L, 1571660787L)
) 

try(
  df1 |> 
    as_duckplyr_df() |>
    summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(INTEGER, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(INTEGER, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}


df2 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(1638210122.2323, 1456061134.223, 1571660787.2321, 1571660787.2112)
)

try(
  df2 |> 
    as_duckplyr_df() |>
    summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

Created on 2024-03-22 with reprex v2.1.0

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.2 (2023-10-31)
#>  os       macOS Sonoma 14.3.1
#>  system   x86_64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Europe/Rome
#>  date     2024-03-22
#>  pandoc   3.1.8 @ /usr/local/bin/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.0)
#>  collections   0.3.7   2023-01-05 [1] CRAN (R 4.3.0)
#>  curl          5.2.0   2023-12-08 [1] CRAN (R 4.3.0)
#>  DBI           1.2.2   2024-02-16 [1] CRAN (R 4.3.2)
#>  digest        0.6.34  2024-01-11 [1] CRAN (R 4.3.0)
#>  dplyr         1.1.4   2023-11-17 [1] CRAN (R 4.3.0)
#>  duckdb        0.10.0  2024-03-13 [1] CRAN (R 4.3.2)
#>  duckplyr    * 0.3.2   2024-03-17 [1] CRAN (R 4.3.2)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.0)
#>  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.0)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.0)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
#>  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.0)
#>  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.0)
#>  knitr         1.45    2023-10-30 [1] CRAN (R 4.3.0)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.26.0  2024-01-24 [1] CRAN (R 4.3.2)
#>  R.utils       2.12.3  2023-11-18 [1] CRAN (R 4.3.0)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  reprex        2.1.0   2024-01-11 [1] CRAN (R 4.3.0)
#>  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.0)
#>  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.0)
#>  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  styler        1.10.2  2023-08-29 [1] CRAN (R 4.3.0)
#>  tibble      * 3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyselect    1.2.1   2024-03-11 [1] CRAN (R 4.3.2)
#>  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.0)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.0)
#>  withr         3.0.0   2024-01-16 [1] CRAN (R 4.3.0)
#>  xfun          0.42    2024-02-08 [1] CRAN (R 4.3.2)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.0)
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Actually, "large numbers" are not the problem here, rather na.rm = TRUE in sum(). Probably needs to be harmonized with dplyr's behaviour.

library(tibble)

df1 <- tibble(
  type = c("A", "B", "A", "B"),
  amt = c(20, 1, NA, 1)
)

try(
  df1 |>
    duckplyr::as_duckplyr_df() |>
    duckplyr::summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)  
)
#> No duckplyr fallback reports ready for upload.
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

# NAs are dropped by default
df1 |>
  duckplyr::as_duckplyr_df() |>
  duckplyr::summarise(amt_tot = sum(amt), .by = type)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Aggregate ["type", sum(amt)]
#>   r_dataframe_scan(0x7fd8e7bd4048)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - type (VARCHAR)
#> - amt_tot (DOUBLE)
#> 
#> # A tibble: 2 × 2
#>   type  amt_tot
#>   <chr>   <dbl>
#> 1 A          20
#> 2 B           2

df1 |>
  dplyr::summarise(amt_tot = sum(amt), .by = type)
#> # A tibble: 2 × 2
#>   type  amt_tot
#>   <chr>   <dbl>
#> 1 A          NA
#> 2 B           2

Created on 2024-03-22 with reprex v2.1.0

@krlmlr, sorry for the follow-up. This latest reprex should make it clear.

I wonder whether duckplyr should beahave similarly to what I get from a connection (i.e. throwing a warning) in case of NA. As it is now we see three different cases:

  • from connection: warning and asked to set na.rm = TRUE. This reminds me or teaches me something about databases 👍🏻
  • duckplyr: na.rm = TRUE throws an error and NA are dropped by default. NA removal is consistent with what I get from connection, but if not advised one might be confused if he forgets for some reasons there's a db under the hood 🤦🏻‍♂️
  • dplyr: NA are not dropped unless one uses na.rm = TRUE
library(DBI)
library(duckdb)
suppressPackageStartupMessages(library(duckplyr))
suppressPackageStartupMessages(library(dplyr))

df1 <- tibble::tibble(
  type = c("A", "B", "A", "B"),
  amt = c(20, 1, NA, 1)
)

con <- dbConnect(duckdb(), read_only = FALSE)
dbWriteTable(con, "tbl1", df1)

tbl1 <- tbl("tbl1", src = con)

# from connection I get a warning which reminds me of the known "db feature"
tbl1 |>
  summarise(amt = sum(amt), .by = type)
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.
#> # Source:   SQL [2 x 2]
#> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]
#>   type    amt
#>   <chr> <dbl>
#> 1 A        20
#> 2 B         2

# make it silent
tbl1 |>
  summarise(amt = sum(amt, na.rm = TRUE), .by = type)
#> # Source:   SQL [2 x 2]
#> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]
#>   type    amt
#>   <chr> <dbl>
#> 1 A        20
#> 2 B         2

withr::with_envvar(
  new = c(DUCKPLYR_FORCE = "TRUE"),
  code = {
    tbl1 |>
      collect() |>
      as_duckplyr_df() |>
      summarise(amt = sum(amt, na.rm = TRUE), .by = type)
  }
)
#> Error: {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}

tbl1 |>
  collect() |>
  as_duckplyr_df() |>
  duckplyr::summarise(amt = sum(amt), .by = type)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Aggregate ["type", sum(amt)]
#>   r_dataframe_scan(0x7fd1495425c8)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - type (VARCHAR)
#> - amt (DOUBLE)
#> 
#> # A tibble: 2 × 2
#>   type    amt
#>   <chr> <dbl>
#> 1 A        20
#> 2 B         2

df1 |> 
  dplyr::summarise(amt = sum(amt), .by = type)
#> # A tibble: 2 × 2
#>   type    amt
#>   <chr> <dbl>
#> 1 A        NA
#> 2 B         2

# expectations:
# - duckplyr 0.3.2 has the same behaviour of dplyr 
#   (very unlikely it will happen since there's always a db under the hood)
# - duckplyr 0.3.2 throws a warning as done from connection, which is not the case right now
# - duckplyr 0.3.2 supports `sum(..., na.rm = TRUE)`, does not throw an error and 
#  `na.rm = TRUE` is used to silence the warning

Created on 2024-03-22 with reprex v2.1.0

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.2 (2023-10-31)
#>  os       macOS Sonoma 14.3.1
#>  system   x86_64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Europe/Rome
#>  date     2024-03-22
#>  pandoc   3.1.8 @ /usr/local/bin/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  blob          1.2.4   2023-03-17 [1] CRAN (R 4.3.0)
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.0)
#>  collections   0.3.7   2023-01-05 [1] CRAN (R 4.3.0)
#>  curl          5.2.0   2023-12-08 [1] CRAN (R 4.3.0)
#>  DBI         * 1.2.2   2024-02-16 [1] CRAN (R 4.3.2)
#>  dbplyr        2.4.0   2023-10-26 [1] CRAN (R 4.3.0)
#>  digest        0.6.34  2024-01-11 [1] CRAN (R 4.3.0)
#>  dplyr       * 1.1.4   2023-11-17 [1] CRAN (R 4.3.0)
#>  duckdb      * 0.10.0  2024-03-13 [1] CRAN (R 4.3.2)
#>  duckplyr    * 0.3.2   2024-03-17 [1] CRAN (R 4.3.2)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.0)
#>  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.0)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.0)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
#>  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.0)
#>  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.0)
#>  knitr         1.45    2023-10-30 [1] CRAN (R 4.3.0)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.26.0  2024-01-24 [1] CRAN (R 4.3.2)
#>  R.utils       2.12.3  2023-11-18 [1] CRAN (R 4.3.0)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  reprex        2.1.0   2024-01-11 [1] CRAN (R 4.3.0)
#>  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.0)
#>  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.0)
#>  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  styler        1.10.2  2023-08-29 [1] CRAN (R 4.3.0)
#>  tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyselect    1.2.1   2024-03-11 [1] CRAN (R 4.3.2)
#>  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.0)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.0)
#>  withr         3.0.0   2024-01-16 [1] CRAN (R 4.3.0)
#>  xfun          0.42    2024-02-08 [1] CRAN (R 4.3.2)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.0)
#> 
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Good catch. We should fall back to base R for sum(na.rm = FALSE), which is the default, and accept na.rm = TRUE . Perhaps we can also fix this straight away by implementing an extension function.

All other aggregate functions with an na.rm argument are affected too.

CC @romainfrancois.