Order preservation in `distinct()` wrong in corner cases
Closed this issue · 0 comments
krlmlr commented
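This is an interesting one. Removing or adding (!) triplets of R1+R3+R2 and R4+R5+R6 changes the result. In the reprex below, the duckplyr result lists cluster 2 as R5, R6, R4, whereas the same call on the plain data frame returns the first-occurrence order R4, R5, R6.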
# Reprex setup: turn off package-conflict warnings before attaching duckplyr.
options(conflicts.policy = list(warn = FALSE))
library(duckplyr)
# NOTE(review): presumably asks duckplyr to keep the input row order in its
# output (the plan printed below has an `Order [___row_number ASC]` step) --
# confirm against the duckplyr documentation.
Sys.setenv("DUCKPLYR_OUTPUT_ORDER" = "TRUE")
# 52 rows in two clusters of 26: cluster 1 cycles through R1, R3, R2 and
# cluster 2 through R4, R5, R6, so the last triplet of each cluster is cut off
# after two names.
data <-
data.frame(
row_cluster = rep(1:2, each = 26L),
row_name = c(
"R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1",
"R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3",
"R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4",
"R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5",
# The trailing NULL is dropped by c(); it only lets every data line above end
# with a comma.
NULL
)
)
# duckplyr path: distinct() on the converted data frame. Per the output printed
# below, cluster 2 comes back as R5, R6, R4 instead of first-occurrence order.
# NOTE(review): the printed plan computes `___row_number_by` with a window that
# has PARTITION BY but no ORDER BY, so the row kept per group is not necessarily
# the first one -- likely the cause, to be confirmed.
data |>
duckplyr::as_duckplyr_df() %>%
distinct(row_cluster, row_name)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Projection [row_cluster as row_cluster, row_name as row_name]
#> Order [___row_number ASC]
#> Filter [==(___row_number_by, 1)]
#> Projection [row_cluster as row_cluster, row_name as row_name, ___row_number as ___row_number, row_number() OVER (PARTITION BY row_cluster, row_name) as ___row_number_by]
#> Projection [row_cluster as row_cluster, row_name as row_name, row_number() OVER () as ___row_number]
#> r_dataframe_scan(0x1187a9f48)
#>
#> ---------------------
#> -- Result Columns --
#> ---------------------
#> - row_cluster (INTEGER)
#> - row_name (VARCHAR)
#>
#> row_cluster row_name
#> 1 1 R1
#> 2 1 R3
#> 3 1 R2
#> 4 2 R5
#> 5 2 R6
#> 6 2 R4
# Reference run: the same distinct() call on the plain data frame, without the
# as_duckplyr_df() conversion. Its output below keeps each combination in order
# of first appearance (R4, R5, R6 for cluster 2).
data |>
distinct(row_cluster, row_name)
#> row_cluster row_name
#> 1 1 R1
#> 2 1 R3
#> 3 1 R2
#> 4 2 R4
#> 5 2 R5
#> 6 2 R6
Created on 2023-11-12 with reprex v2.0.2