duckdblabs/duckplyr

Order preservation in `distinct()` wrong in corner cases

Closed this issue · 0 comments

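This is an interesting one. Removing or adding (!) triplets of R1+R3+R2 and R4+R5+R6 in the data changes the result. With the data below, duckplyr returns the rows of cluster 2 in the order R5, R6, R4, whereas the same `distinct()` call without duckplyr returns them in order of first appearance (R4, R5, R6).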

# Do not emit conflict messages when packages are attached.
options(conflicts.policy = list(warn = FALSE))

library(duckplyr)

# NOTE(review): presumably asks duckplyr to keep the input row order in its
# results (cf. the `Order [___row_number ASC]` step in the plan printed
# further down) -- confirm against the duckplyr documentation.
Sys.setenv("DUCKPLYR_OUTPUT_ORDER" = "TRUE")

# Reproduction data: 52 rows in two clusters of 26 rows each.
# Cluster 1 cycles through "R1", "R3", "R2"; cluster 2 cycles through
# "R4", "R5", "R6". Each cluster stops mid-cycle (26 is not a multiple of 3).
data <-
  data.frame(
    row_cluster = rep(1:2, each = 26L),
    row_name = c(
      "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1",
      "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3", "R2", "R1", "R3",
      "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4",
      "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5", "R6", "R4", "R5",
      NULL # dropped by c(); only there so the line above can end with a comma
    )
  )

# Failing case: distinct() on the duckplyr data frame. The output recorded
# below lists cluster 2 as R5, R6, R4 rather than in order of first
# appearance (R4, R5, R6).
# NOTE(review): in the printed plan, `row_number() OVER (PARTITION BY ...)`
# has no ORDER BY, so presumably the row kept per group need not be the first
# one in input order -- confirm against the duckplyr implementation.
data |>
  duckplyr::as_duckplyr_df() %>%
  distinct(row_cluster, row_name)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Projection [row_cluster as row_cluster, row_name as row_name]
#>   Order [___row_number ASC]
#>     Filter [==(___row_number_by, 1)]
#>       Projection [row_cluster as row_cluster, row_name as row_name, ___row_number as ___row_number, row_number() OVER (PARTITION BY row_cluster, row_name) as ___row_number_by]
#>         Projection [row_cluster as row_cluster, row_name as row_name, row_number() OVER () as ___row_number]
#>           r_dataframe_scan(0x1187a9f48)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - row_cluster (INTEGER)
#> - row_name (VARCHAR)
#> 
#>   row_cluster row_name
#> 1           1       R1
#> 2           1       R3
#> 3           1       R2
#> 4           2       R5
#> 5           2       R6
#> 6           2       R4

# Reference run: the same distinct() call without as_duckplyr_df(). Its
# recorded output keeps cluster 2 in first-appearance order (R4, R5, R6).
data |>
  distinct(row_cluster, row_name)
#>   row_cluster row_name
#> 1           1       R1
#> 2           1       R3
#> 3           1       R2
#> 4           2       R4
#> 5           2       R5
#> 6           2       R6

Created on 2023-11-12 with reprex v2.0.2