Elaborate on map(.id = vars)
Closed this issue · 2 comments
wlandau commented
- Choosing ID variables.
- Advantages and dangers of `map(.id = FALSE)`.
- How to define custom independent IDs for targets.
wlandau commented
Problem: when you add new targets, the positional suffixes of existing targets can shift, which renames those targets and invalidates them.
library(drake)
# Baseline: map over two datasets with .id = FALSE. As the printed plan
# shows, the target names are the base name `x` plus a positional suffix
# (x, x_2) rather than anything derived from the values of `data`.
dataset <- c("census", "worldbank")
drake_plan(
x = target(
analyze_data(data),
transform = map(data = !!dataset, .id = FALSE)
)
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x analyze_data("census")
#> 2 x_2 analyze_data("worldbank")
# Add a new dataset ("gapminder") in the middle of the vector.
dataset <- c("census", "gapminder", "worldbank")
# The World Bank analysis is renamed from x_2 to x_3 (see the output below),
# which invalidates the target even though its command,
# analyze_data("worldbank"), is unchanged.
drake_plan(
x = target(
analyze_data(data),
transform = map(data = !!dataset, .id = FALSE)
)
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 x analyze_data("census")
#> 2 x_2 analyze_data("gapminder")
#> 3 x_3 analyze_data("worldbank")
Created on 2019-02-27 by the reprex package (v0.2.1)
Solution: create your own IDs. Below, the World Bank target is always `x_b9e5ea49`.
library(drake)
library(tidyverse)
# Turn each element of `x` into a content-based ID: compute its murmur32
# digest (serialize = FALSE, so the value itself is hashed rather than its
# serialized form) and return the digests as a list of symbols.
my_id <- function(x) {
  hashes <- map_chr(x, digest::digest, algo = "murmur32", serialize = FALSE)
  rlang::syms(hashes)
}
# Build one hash-based ID symbol per dataset, then map over `data` and `id`
# together. With .id = id, the target suffix comes from the hash only
# (see the target names in the output below).
dataset <- c("census", "worldbank")
id_vals <- my_id(dataset)
# In the printed plan, `id` inside quote() is replaced by the hash symbol;
# quote() returns that symbol unevaluated, so it is never looked up as a
# variable when the command runs.
drake_plan(
x = target({
quote(id)
analyze_data(data)
},
transform = map(data = !!dataset, id = !!id_vals, .id = id)
)
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x_365d5563 { quote(`365d5563`) analyze_data("census") }
#> 2 x_b9e5ea49 { quote(b9e5ea49) analyze_data("worldbank") }
# Same plan after inserting "gapminder" in the middle of the vector. The
# IDs are recomputed from the dataset names, so the existing targets keep
# their names: the World Bank target is still x_b9e5ea49 in the output.
dataset <- c("census", "gapminder", "worldbank")
id_vals <- my_id(dataset)
drake_plan(
x = target({
quote(id)
analyze_data(data)
},
transform = map(data = !!dataset, id = !!id_vals, .id = id)
)
)
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 x_365d5563 { quote(`365d5563`) analyze_data("census") }
#> 2 x_3d724852 { quote(`3d724852`) analyze_data("gapminder") }
#> 3 x_b9e5ea49 { quote(b9e5ea49) analyze_data("worldbank") }
Created on 2019-02-27 by the reprex package (v0.2.1)
wlandau commented
So then why is this not already the default behavior of `map(.id = FALSE)`? Because:
- Hash suffixes are cryptic and ugly.
- An automated solution would not completely solve the problem. If your grid of grouping variables has duplicates, we still need `drake:::make_unique()` to add the suffixes `_2`, `_3`, etc.
library(drake)
library(tidyverse)
# Turn each element of `x` into a content-based ID: compute its murmur32
# digest (serialize = FALSE, so the value itself is hashed rather than its
# serialized form) and return the digests as a list of symbols.
my_id <- function(x) {
  hashes <- map_chr(x, digest::digest, algo = "murmur32", serialize = FALSE)
  rlang::syms(hashes)
}
# Duplicate grouping values: both elements of `y_vals` are 1, so both get
# the same hash-based ID.
y_vals <- c(1, 1)
id_vals <- my_id(as.character(y_vals))
# The two targets would otherwise share the name x_9416ac93; the output
# below shows a positional suffix (_2) added to the second one.
drake_plan(x = target({
id
y
},
transform = map(y = !!y_vals, id = !!id_vals, .id = id))
)
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x_9416ac93 { `9416ac93` 1 }
#> 2 x_9416ac93_2 { `9416ac93` 1 }
Created on 2019-02-28 by the reprex package (v0.2.1)