Implement `sum(na.rm = FALSE)`
Opened this issue · 3 comments
Skip to: #130 (comment)
library(tibble)
suppressPackageStartupMessages(library(duckplyr))
df1 <- tibble(
type = c("A", "B", "A", "B"),
amt = c(1638210122L, 1456061134L, 1571660787L, 1571660787L)
)
try(
df1 |>
as_duckplyr_df() |>
summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(INTEGER, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(INTEGER, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}
df2 <- tibble(
type = c("A", "B", "A", "B"),
amt = c(1638210122.2323, 1456061134.223, 1571660787.2321, 1571660787.2112)
)
try(
df2 |>
as_duckplyr_df() |>
summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)
)
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}
Created on 2024-03-22 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.3.2 (2023-10-31)
#> os macOS Sonoma 14.3.1
#> system x86_64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Rome
#> date 2024-03-22
#> pandoc 3.1.8 @ /usr/local/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.0)
#> collections 0.3.7 2023-01-05 [1] CRAN (R 4.3.0)
#> curl 5.2.0 2023-12-08 [1] CRAN (R 4.3.0)
#> DBI 1.2.2 2024-02-16 [1] CRAN (R 4.3.2)
#> digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.0)
#> dplyr 1.1.4 2023-11-17 [1] CRAN (R 4.3.0)
#> duckdb 0.10.0 2024-03-13 [1] CRAN (R 4.3.2)
#> duckplyr * 0.3.2 2024-03-17 [1] CRAN (R 4.3.2)
#> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.0)
#> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
#> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
#> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.0)
#> htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.0)
#> knitr 1.45 2023-10-30 [1] CRAN (R 4.3.0)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
#> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.3.0)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0)
#> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.2)
#> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
#> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.0)
#> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.0)
#> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.0)
#> rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
#> styler 1.10.2 2023-08-29 [1] CRAN (R 4.3.0)
#> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.2)
#> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.0)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.0)
#> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.0)
#> xfun 0.42 2024-02-08 [1] CRAN (R 4.3.2)
#> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.0)
#>
#> ──────────────────────────────────────────────────────────────────────────────
Actually, "large numbers" are not the problem here, rather na.rm = TRUE
in sum()
. Probably needs to be harmonized with dplyr's behaviour.
library(tibble)
df1 <- tibble(
type = c("A", "B", "A", "B"),
amt = c(20, 1, NA, 1)
)
try(
df1 |>
duckplyr::as_duckplyr_df() |>
duckplyr::summarise(amt_tot = sum(amt, na.rm = TRUE), .by = type)
)
#> No duckplyr fallback reports ready for upload.
#> Error : {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}
# NAs are dropped by default
df1 |>
duckplyr::as_duckplyr_df() |>
duckplyr::summarise(amt_tot = sum(amt), .by = type)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Aggregate ["type", sum(amt)]
#> r_dataframe_scan(0x7fd8e7bd4048)
#>
#> ---------------------
#> -- Result Columns --
#> ---------------------
#> - type (VARCHAR)
#> - amt_tot (DOUBLE)
#>
#> # A tibble: 2 × 2
#> type amt_tot
#> <chr> <dbl>
#> 1 A 20
#> 2 B 2
df1 |>
dplyr::summarise(amt_tot = sum(amt), .by = type)
#> # A tibble: 2 × 2
#> type amt_tot
#> <chr> <dbl>
#> 1 A NA
#> 2 B 2
Created on 2024-03-22 with reprex v2.1.0
@krlmlr, sorry for the follow-up. This latest reprex should make it clear.
I wonder whether duckplyr should beahave similarly to what I get from a connection (i.e. throwing a warning) in case of NA
. As it is now we see three different cases:
- from connection: warning and asked to set
na.rm = TRUE
. This reminds me or teaches me something about databases 👍🏻 - duckplyr:
na.rm = TRUE
throws an error andNA
are dropped by default.NA
removal is consistent with what I get from connection, but if not advised one might be confused if he forgets for some reasons there's a db under the hood 🤦🏻♂️ - dplyr:
NA
are not dropped unless one usesna.rm = TRUE
library(DBI)
library(duckdb)
suppressPackageStartupMessages(library(duckplyr))
suppressPackageStartupMessages(library(dplyr))
df1 <- tibble::tibble(
type = c("A", "B", "A", "B"),
amt = c(20, 1, NA, 1)
)
con <- dbConnect(duckdb(), read_only = FALSE)
dbWriteTable(con, "tbl1", df1)
tbl1 <- tbl("tbl1", src = con)
# from connection I get a warning which reminds me of the known "db feature"
tbl1 |>
summarise(amt = sum(amt), .by = type)
#> Warning: Missing values are always removed in SQL aggregation functions.
#> Use `na.rm = TRUE` to silence this warning
#> This warning is displayed once every 8 hours.
#> # Source: SQL [2 x 2]
#> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]
#> type amt
#> <chr> <dbl>
#> 1 A 20
#> 2 B 2
# make it silent
tbl1 |>
summarise(amt = sum(amt, na.rm = TRUE), .by = type)
#> # Source: SQL [2 x 2]
#> # Database: DuckDB v0.10.0 [root@Darwin 23.3.0:R 4.3.2/:memory:]
#> type amt
#> <chr> <dbl>
#> 1 A 20
#> 2 B 2
withr::with_envvar(
new = c(DUCKPLYR_FORCE = "TRUE"),
code = {
tbl1 |>
collect() |>
as_duckplyr_df() |>
summarise(amt = sum(amt, na.rm = TRUE), .by = type)
}
)
#> Error: {"exception_type":"Binder","exception_message":"No function matches the given name and argument types 'sum(DOUBLE, BOOLEAN)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsum(DECIMAL) -> DECIMAL\n\tsum(SMALLINT) -> HUGEINT\n\tsum(INTEGER) -> HUGEINT\n\tsum(BIGINT) -> HUGEINT\n\tsum(HUGEINT) -> HUGEINT\n\tsum(DOUBLE) -> DOUBLE\n","name":"sum","candidates":"sum(DECIMAL) -> DECIMAL,sum(SMALLINT) -> HUGEINT,sum(INTEGER) -> HUGEINT,sum(BIGINT) -> HUGEINT,sum(HUGEINT) -> HUGEINT,sum(DOUBLE) -> DOUBLE","call":"sum(DOUBLE, BOOLEAN)","error_subtype":"NO_MATCHING_FUNCTION"}
tbl1 |>
collect() |>
as_duckplyr_df() |>
duckplyr::summarise(amt = sum(amt), .by = type)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Aggregate ["type", sum(amt)]
#> r_dataframe_scan(0x7fd1495425c8)
#>
#> ---------------------
#> -- Result Columns --
#> ---------------------
#> - type (VARCHAR)
#> - amt (DOUBLE)
#>
#> # A tibble: 2 × 2
#> type amt
#> <chr> <dbl>
#> 1 A 20
#> 2 B 2
df1 |>
dplyr::summarise(amt = sum(amt), .by = type)
#> # A tibble: 2 × 2
#> type amt
#> <chr> <dbl>
#> 1 A NA
#> 2 B 2
# expectations:
# - duckplyr 0.3.2 has the same behaviour of dplyr
# (very unlikely it will happen since there's always a db under the hood)
# - duckplyr 0.3.2 throws a warning as done from connection, which is not the case right now
# - duckplyr 0.3.2 supports `sum(..., na.rm = TRUE)`, does not throw an error and
# `na.rm = TRUE` is used to silence the warning
Created on 2024-03-22 with reprex v2.1.0
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.3.2 (2023-10-31)
#> os macOS Sonoma 14.3.1
#> system x86_64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Rome
#> date 2024-03-22
#> pandoc 3.1.8 @ /usr/local/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> blob 1.2.4 2023-03-17 [1] CRAN (R 4.3.0)
#> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.0)
#> collections 0.3.7 2023-01-05 [1] CRAN (R 4.3.0)
#> curl 5.2.0 2023-12-08 [1] CRAN (R 4.3.0)
#> DBI * 1.2.2 2024-02-16 [1] CRAN (R 4.3.2)
#> dbplyr 2.4.0 2023-10-26 [1] CRAN (R 4.3.0)
#> digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.0)
#> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.0)
#> duckdb * 0.10.0 2024-03-13 [1] CRAN (R 4.3.2)
#> duckplyr * 0.3.2 2024-03-17 [1] CRAN (R 4.3.2)
#> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.0)
#> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
#> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
#> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.0)
#> htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.0)
#> knitr 1.45 2023-10-30 [1] CRAN (R 4.3.0)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
#> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.3.0)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0)
#> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.2)
#> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
#> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.0)
#> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.0)
#> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.0)
#> rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
#> styler 1.10.2 2023-08-29 [1] CRAN (R 4.3.0)
#> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.2)
#> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.0)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.0)
#> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.0)
#> xfun 0.42 2024-02-08 [1] CRAN (R 4.3.2)
#> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.0)
#>
#>
#> ──────────────────────────────────────────────────────────────────────────────
Good catch. We should fall back to base R for sum(na.rm = FALSE)
, which is the default, and accept na.rm = TRUE
. Perhaps we can also fix this straight away by implementing an extension function.
All other aggregate functions with an na.rm
argument are affected too.
CC @romainfrancois.