complete overlaps in each group variable

Question

complete overlaps in each group variable

Closed this issue 2 years ago · 1 comments

Hi, I share the input and output so you can run it easily.
Input:


df = data.frame(
group = c("A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B"),
year = c(2016,2016,2016,2016,2017,2017,2017,2017,2017,2016,2016,2016,2017,2017,2017,2017),
time_left = c("2016-01-01","2016-12-01","2016-12-13","2016-12-16","2017-01-01","2017-01-01","2017-01-01","2017-01-04","2017-01-04",
"2016-01-01","2016-01-10","2016-10-10","2017-01-01","2017-01-01","2017-03-03","2017-05-22"),
time_right = c("2016-12-31","2016-12-31","2016-12-31","2016-12-31","2017-06-28","2017-08-11","2017-12-31","2017-03-31","2017-04-21",
"2016-01-25","2016-01-20","2016-12-31","2017-01-01","2017-03-16","2017-05-12","2017-06-27"), 
x = c( 30,100,20,NA,20,1,0,12,NA,NA,NA,10,10,1000,20,NA))

My desired output, removes those records that their time intervals are fully overlapped in other records' time intervals in the same group. Also, returns the max(x) for those overlapped time intervals even if one of the overlapped time intervals has X = NA. If X for all overlapped time intervals is NA, then it will return NA.

the output of this data frame is:

output = data.frame(group = c("A", "A", "B", "B", "B", "B", "B"),
                    year = c(2016, 2016, 2016, 2016, 2017, 2017, 2017),
                    time_left = c("2016-01-01", "2017-01-01", "2016-01-01", "2016-10-10", "2017-01-01", "2017-03-03", "2017-05-22"), 
                    time_right = c("2016-12-31","2017-12-31", "2016-01-25", "2016-12-31", "2017-03-16", "2017-05-12", "2017-06-27"), 
                    x = c(100,20, NA, 10, 1000, 20, NA))

Thank you so much for your help.

Answer 1 · 2022-09-04T14:25:23.000Z

You can basically use the same code as in #20, but max() doesn't really work the way you want it to (when na.rm = TRUE and everything is NA, it returns -Inf) so you need an extra helper for that (we plan to add vec_min() to vctrs in r-lib/vctrs#86)

library(ivs)
library(dplyr)
library(vctrs)
library(rlang)

vec_max <- function(x, na_rm, empty) {
  if (na_rm) {
    missing <- vec_equal_na(x)
    x <- vec_slice(x, !missing)
  }
  
  empty <- vec_cast(empty, to = x)
  
  if (vec_size(x) == 0L) {
    empty
  } else {
    max(x)
  }
}

iv_locate_max_containment <- function(x, ..., missing = "equals") {
  check_dots_empty0(...)
  # No support for integer(1) `missing` here, that wouldn't make sense
  # because we don't want to allow arbitrary values. The haystack locations
  # are meaningful.
  missing <- arg_match0(missing, values = c("equals", "drop", "error"))
  
  loc_unique <- vec_unique_loc(x)
  
  needles <- vec_slice(x, loc_unique)
  haystack <- x
  
  # TODO: Catch and rethrow error when `missing = "error"` because
  # it might reference a location that isn't correct since `needles`
  # is the unique locations from `x`. Should be able to use `loc_unique`
  # to map the location in the error condition to the input location.
  
  # Find all locations where the range "contains" any other range
  # (including itself). Limited `needles` to only unique values to
  # ensure that "top" ranges are detectable (i.e. duplicates would
  # make "top" ranges appear to be contained in another container)
  locs <- iv_locate_overlaps(
    needles = needles, 
    haystack = haystack, 
    type = "contains", 
    missing = missing
  )
  
  # Find the "top" ranges, i.e. the containers that aren't contained
  # by any other containers
  top <- !vec_duplicate_detect(locs$haystack)
  top <- vec_slice(locs$needles, top)
  top <- vec_in(locs$needles, top)
  
  # Slice out only the overlaps corresponding to the "top" ranges
  locs <- vec_slice(locs, top)
  
  # Map the unique needle locations back to their input location
  locs$needles <- vec_slice(loc_unique, locs$needles)
  
  locs
}

df = data.frame(
  group = c("A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B"),
  year = c(2016,2016,2016,2016,2017,2017,2017,2017,2017,2016,2016,2016,2017,2017,2017,2017),
  time_left = c("2016-01-01","2016-12-01","2016-12-13","2016-12-16","2017-01-01","2017-01-01","2017-01-01","2017-01-04","2017-01-04",
                "2016-01-01","2016-01-10","2016-10-10","2017-01-01","2017-01-01","2017-03-03","2017-05-22"),
  time_right = c("2016-12-31","2016-12-31","2016-12-31","2016-12-31","2017-06-28","2017-08-11","2017-12-31","2017-03-31","2017-04-21",
                 "2016-01-25","2016-01-20","2016-12-31","2017-01-01","2017-03-16","2017-05-12","2017-06-27"), 
  x = c( 30,100,20,NA,20,1,0,12,NA,NA,NA,10,10,1000,20,NA))

# Setup the range
df <- df %>%
  as_tibble() %>%
  mutate(
    time_left = as.Date(time_left),
    time_right = as.Date(time_right)
  ) %>%
  # Add 1 because ranges are half-open like `[ )`
  mutate(time_right = time_right + 1L) %>%
  mutate(range = iv(time_left, time_right), .keep = "unused")

df %>%
  group_by(group) %>%
  summarise({
    locs <- iv_locate_max_containment(range)
    out <- iv_align(range, x, locations = locs)
    rename(out, range = needles, x = haystack)
  }, .groups = "drop") %>%
  group_by(group, range) %>%
  summarise(
    x = vec_max(x, na_rm = TRUE, empty = NA), 
    .groups = "drop"
  )
#> # A tibble: 7 × 3
#>   group                    range     x
#>   <chr>               <iv<date>> <dbl>
#> 1 A     [2016-01-01, 2017-01-01)   100
#> 2 A     [2017-01-01, 2018-01-01)    20
#> 3 B     [2016-01-01, 2016-01-26)    NA
#> 4 B     [2016-10-10, 2017-01-01)    10
#> 5 B     [2017-01-01, 2017-03-17)  1000
#> 6 B     [2017-03-03, 2017-05-13)    20
#> 7 B     [2017-05-22, 2017-06-28)    NA

^{Created on 2022-09-04 with reprex v2.0.2}