mlampros/IceSat2R

Bug in `time_specific_orbits`: `Class attribute on column 3 of item 96 does not match with column 3 of item 1.`

evetion opened this issue · 10 comments

From the documentation here, I ran

> start_date_w = "2020-12-15"
> end_date_w = "2021-02-15"
> 
> rgt_winter = time_specific_orbits(date_from = start_date_w,
+                                   date_to = end_date_w,
+                                   RGT_cycle = NULL,
+                                   download_method = 'curl',
+                                   threads = parallel::detectCores(),
+                                   verbose = TRUE)
The available Icesat-2 orbits will be red from 'https://icesat-2.gsfc.nasa.gov/science/specs' ... 
Access the data of the technical specs website ...
Extract the .zip files and the corresponding titles ...
Keep the relevant data from the url's and titles ...
Process the nominal and time specific orbits separately ...
Adjust the Dates of the time specific orbits ...
Create the nominal orbits data.table ...
Create the time specific orbits data.table ...
Return a single data.table ...
Elapsed time: 0 hours and 0 minutes and 0 seconds. 
ICESAT-2 orbits: 'Earliest-Date' is '2018-10-13'  'Latest-Date' is '2022-09-20' 
-----------------------------------------------------
The .zip file of 'RGT_cycle_9' will be downloaded ... 
-----------------------------------------------------
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  140M  100  140M    0     0  5630k      0  0:00:25  0:00:25 --:--:-- 2878k
The downloaded .zip file will be extracted in the '/var/folders/w_/152mmjvx4cx7y55ky886tp1h0000gn/T//RtmpukLAeZ/RGT_cycle_9' directory ... 
Download and unzip the RGT_cycle_9 .zip file: Elapsed time: 0 hours and 0 minutes and 27 seconds. 
138 .kml files will be processed ... 
Parallel processing of 138 .kml files using  10  threads starts ... 
Error in data.table::rbindlist(inner_obj) : 
  Class attribute on column 3 of item 96 does not match with column 3 of item 1.

RGT_cycle_9_sample_files.zip

The easiest way to debug this error is to run specific lines of the "time_specific_orbits" function and observe which sublist does not have the same dimensions compared to the remaining (the attached .zip file includes 2 .kml files of RGT cycle 9),

setwd('directory where the sample .kml files are saved')
vec_files = c('IS2_RGT_0001_cycle9_24-Sep-2020.kml', 'IS2_RGT_0002_cycle9_24-Sep-2020.kml')
if (!any(c(file.exists(vec_files[1]), file.exists(vec_files[2])))) stop("The saved .kml files are not in the correct directory!")

require(doParallel)
require(utils)
require(sf)
require(glue)
require(foreach)
require(data.table)


threads = 4
LEN_subset = 2
verbose = TRUE


if (threads > 1 & LEN_subset > 1) {
  if (verbose) cat(glue::glue("Parallel processing of {LEN_subset} .kml files using  {threads}  threads starts ..."), '\n')

  if (.Platform$OS.type == "unix") {
    doParallel::registerDoParallel(cores = threads)
  }

  if (.Platform$OS.type == "windows") {
    cl = parallel::makePSOCKcluster(threads)
    doParallel::registerDoParallel(cl = cl)
  }
}

sf_objs = list()

for (idx in 1:LEN_subset) {
  if (verbose) {
    message("FILE: ", idx, "/", LEN_subset, "\r", appendLF = FALSE)
    utils::flush.console()
  }

  FILE = vec_files[idx]
  iter_file = vec_files[idx]
  lrs = sf::st_layers(dsn = iter_file)
  NAMS = lrs$name
  if (length(NAMS) == 0) stop(glue::glue("The file '{FILE}' does not include any layers!"), call. = F)
  nam_file = gsub('.kml', '', FILE)
  LEN_nams = length(NAMS)

  if (threads > 1 & LEN_subset > 1) {
    inner_obj = foreach::foreach(i = 1:LEN_nams) %dopar% {
      LAYER = NAMS[i]
      lr_dat = sf::st_read(dsn = iter_file, layer = LAYER, quiet = T)         # I expect each .kml file to consist of a single 'sfc_POINT'
      lr_dat
    }
  }
  else {
    inner_obj = lapply(1:LEN_nams, function(x) {
      LAYER = NAMS[x]
      lr_dat = sf::st_read(dsn = iter_file, layer = LAYER, quiet = T)         # I expect each .kml file to consist of a single 'sfc_POINT'
      lr_dat
    })
  }

  # class_obj = as.vector(unlist(lapply(inner_obj, function(x) class(sf::st_geometry(x))[1])))                      # I expect each sublist to be of type "sfc_POINT" and normally observations which are not (and can be for instance "sfc_LINESTRING") won't have a description column either. Therefore use the next line for removal
  descr_not_idx = which(as.vector(unlist(lapply(inner_obj, function(x) is.na(x$description) | x$description == ""))))
  LEN_exc = length(descr_not_idx)
  if (LEN_exc > 0) {
    if (verbose) message(glue::glue("{LEN_exc} output sublist did not have a valid 'description' and will be removed!"))
    inner_obj = inner_obj[-descr_not_idx]
  }

  inner_obj = sf::st_as_sf(data.table::rbindlist(inner_obj))
  sf_objs[[glue::glue("{nam_file}")]] = inner_obj
}

str(sf_objs)

# List of 2
# $ IS2_RGT_0001_cycle9_24-Sep-2020:Classes ‘sf’, ‘data.table’ and 'data.frame':	95 obs. of  12 variables:
#   ..$ Name        : chr [1:95] "" "" "" "" ...
# ..$ description : chr [1:95] "RGT 1\n24-Sep-2020 05:39:06\nDOY 268\nCycle 9" "RGT 1\n24-Sep-2020 05:40:06\nDOY 268\nCycle 9" "RGT 1\n24-Sep-2020 05:41:06\nDOY 268\nCycle 9" "RGT 1\n24-Sep-2020 05:42:06\nDOY 268\nCycle 9" ...
# ..$ timestamp   : POSIXct[1:95], format: NA NA NA NA ...
# ..$ begin       : POSIXct[1:95], format: NA NA NA NA ...
# ..$ end         : POSIXct[1:95], format: NA NA NA NA ...
# ..$ altitudeMode: chr [1:95] "clampToGround" "clampToGround" "clampToGround" "clampToGround" ...
# ..$ tessellate  : int [1:95] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ extrude     : int [1:95] 0 0 0 0 0 0 0 0 0 0 ...
# ..$ visibility  : int [1:95] 1 1 1 1 1 1 1 1 1 1 ...
# ..$ drawOrder   : int [1:95] NA NA NA NA NA NA NA NA NA NA ...
# ..$ icon        : chr [1:95] NA NA NA NA ...
# ..$ geometry    :sfc_POINT of length 95; first list element:  'XYZ' num [1:3] -0.132 0.028 1
# ..- attr(*, "sf_column")= chr "geometry"
# ..- attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
# .. ..- attr(*, "names")= chr [1:11] "Name" "description" "timestamp" "begin" ...
# $ IS2_RGT_0002_cycle9_24-Sep-2020:Classes ‘sf’, ‘data.table’ and 'data.frame':	95 obs. of  12 variables:
#   ..$ Name        : chr [1:95] "" "" "" "" ...
# ..$ description : chr [1:95] "RGT 2\n24-Sep-2020 07:13:23\nDOY 268\nCycle 9" "RGT 2\n24-Sep-2020 07:14:23\nDOY 268\nCycle 9" "RGT 2\n24-Sep-2020 07:15:23\nDOY 268\nCycle 9" "RGT 2\n24-Sep-2020 07:16:23\nDOY 268\nCycle 9" ...
# ..$ timestamp   : POSIXct[1:95], format: NA NA NA NA ...
# ..$ begin       : POSIXct[1:95], format: NA NA NA NA ...
# ..$ end         : POSIXct[1:95], format: NA NA NA NA ...
# ..$ altitudeMode: chr [1:95] "clampToGround" "clampToGround" "clampToGround" "clampToGround" ...
# ..$ tessellate  : int [1:95] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
# ..$ extrude     : int [1:95] 0 0 0 0 0 0 0 0 0 0 ...
# ..$ visibility  : int [1:95] 1 1 1 1 1 1 1 1 1 1 ...
# ..$ drawOrder   : int [1:95] NA NA NA NA NA NA NA NA NA NA ...
# ..$ icon        : chr [1:95] NA NA NA NA ...
# ..$ geometry    :sfc_POINT of length 95; first list element:  'XYZ' num [1:3] -23.74811 0.00707 1
# ..- attr(*, "sf_column")= chr "geometry"
# ..- attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
# .. ..- attr(*, "names")= chr [1:11] "Name" "description" "timestamp" "begin" ...

sf_objs_dtbl = sf::st_as_sf(data.table::rbindlist(sf_objs))
sf_objs_dtbl

# Simple feature collection with 190 features and 11 fields
# Geometry type: POINT
# Dimension:     XYZ
# Bounding box:  xmin: -0.1318472 ymin: 0.02795893 xmax: -0.1318472 ymax: 0.02795893
# z_range:       zmin: 1 zmax: 1
# CRS:           4326
# First 10 features:
# Name                                   description timestamp begin  end  altitudeMode tessellate extrude visibility drawOrder icon                       geometry
# 1       RGT 1\n24-Sep-2020 05:39:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-0.1318472 0.02795...
# 2       RGT 1\n24-Sep-2020 05:40:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-0.5484491 3.86669...
# 3       RGT 1\n24-Sep-2020 05:41:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-0.9340453 7.70776...
# 4       RGT 1\n24-Sep-2020 05:42:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-1.322114 11.54864 1)
# 5       RGT 1\n24-Sep-2020 05:43:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA>  POINT Z (-1.71399 15.38886 1)
# 6       RGT 1\n24-Sep-2020 05:44:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-2.111149 19.22799 1)
# 7       RGT 1\n24-Sep-2020 05:45:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-2.515282 23.06563 1)
# 8       RGT 1\n24-Sep-2020 05:46:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA>  POINT Z (-2.928375 26.9014 1)
# 9       RGT 1\n24-Sep-2020 05:47:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-3.352826 30.73494 1)
# 10      RGT 1\n24-Sep-2020 05:48:06\nDOY 268\nCycle 9      <NA>  <NA> <NA> clampToGround         -1       0          1        NA <NA> POINT Z (-3.791596 34.56594 1)

Each sublist consists of a data.table with 95 rows and 12 columns and once I rbind these I receive a simple features object with 190 rows and 11 columns. Do you receive the same output objects for these 2 sample files?

That's a lot of code to debug something simple? Anyway, with your examples, I get the same error. The error stems from item 96 in inner_obj, which is a Line, instead of Point like the others:

[[95]]
Simple feature collection with 1 feature and 2 fields
Geometry type: POINT
Dimension:     XYZ
Bounding box:  xmin: -23.63926 ymin: -1.081025 xmax: -23.63926 ymax: -1.081025
z_range:       zmin: 1 zmax: 1
Geodetic CRS:  WGS 84
  Name                                Description                       geometry
1      RGT 1 24-Sep-2020 07:13:06 DOY 268 Cycle 9 POINT Z (-23.63926 -1.08102...

[[96]]
Simple feature collection with 1 feature and 2 fields
Geometry type: LINESTRING
Dimension:     XYZ
Bounding box:  xmin: -179.9708 ymin: -88.0062 xmax: 179.9952 ymax: 88.00649
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
   Name Description                       geometry
1 RGT 1             LINESTRING Z (-0.1318472 0....

can you install the latest version using

remotes::install_github('mlampros/IceSat2R')

and re-try your example. It seems to me that I had a similar issue during the binder configuration when one of the R objects (sublists) were not of the same type (but this was in the first version of the package)

I got the same error with the latest version.

normally the next code snippet should have shown a message regarding the mismatch in dimensions,

  descr_not_idx = which(as.vector(unlist(lapply(inner_obj, function(x) is.na(x$description) | x$description == ""))))
  LEN_exc = length(descr_not_idx)
  if (LEN_exc > 0) {
    if (verbose) message(glue::glue("{LEN_exc} output sublist did not have a valid 'description' and will be removed!"))
    inner_obj = inner_obj[-descr_not_idx]
  }

before the

inner_obj = sf::st_as_sf(data.table::rbindlist(inner_obj))

which throws the error

Is it possible that you create a new variable for the sublist with index 96 (of "inner_obj") and show me what you receive in the console using,

subs = inner_obj[[96]]
str(subs)
subs$description

I would expect in case of a "LINESTRING" and not a "POINT" sf object that it would be removed (the "description" is either NA or an empty string)

> subs = inner_obj[[96]]
> subs
Simple feature collection with 1 feature and 2 fields
Geometry type: LINESTRING
Dimension:     XYZ
Bounding box:  xmin: -179.9708 ymin: -88.0062 xmax: 179.9952 ymax: 88.00649
z_range:       zmin: 0 zmax: 0
Geodetic CRS:  WGS 84
   Name Description                       geometry
1 RGT 1             LINESTRING Z (-0.1318472 0....
> str(subs)
Classessfand 'data.frame':	1 obs. of  3 variables:
 $ Name       : chr "RGT 1"
 $ Description: chr ""
 $ geometry   :sfc_LINESTRING of length 1; first list element:  'XYZ' num [1:5656, 1:3] -0.132 -0.138 -0.145 -0.151 -0.157 ...
 - attr(*, "sf_column")= chr "geometry"
 - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA
  ..- attr(*, "names")= chr [1:2] "Name" "Description"
> subs$description
NULL

So, on closer inspection, there's a case sensitivity bug. I'm not sure where this stems from (I'm not a R nor KML user), but this is actually what you expect right?

> subs$Description
[1] ""

I'm wondering why this happens. We use the same ICESat-2 data but once you upload the data the 'description' column name appears with an uppercase 'D' in your case (as 'Description').

I'm on a M1 Mac, although my filesystem isn't case sensitive, this might've something to do with it? Normally you would catch such platform issues with CI.

I've added a test case to account for this issue on Mac osx, however the .tic configuration for Windows and Mac osx fails since yesterday due to a "mapview" and "terra" configuration issue (it's not related to the added test case). I have to wait a few days to see if the issue resolves on its own (I tried to add a few dependencies but it didn't work)

It seems that the error case in tic was not fixed (for the Mac OSx). I've set-up mac os, R and the required for the IceSat2R packages to fix and test this issue. I reproduced the issue and modified the code and now the function works as expected.
The updated code can be tested (by any user) after configuring the geospatial packages. The related to this issue functions are "time_specific_orbits()" and "vsi_time_specific_orbits_wkt()" and can be tested using the following code snippet:

require(IceSat2R)
require(testthat)

testthat::test_that("the function 'time_specific_orbits()' returns the expected output for an input date range", {

  approx_date_start = "2021-02-01"
  approx_date_end = "2021-02-15"

  res_rgt_many = time_specific_orbits(date_from = approx_date_start,
                                      date_to = approx_date_end,
                                      RGT_cycle = NULL,
                                      download_method = 'curl',
                                      threads = 1,
                                      verbose = FALSE)

  testthat::expect_true( inherits(res_rgt_many, c("sf", "data.table", "data.frame")) & nrow(res_rgt_many) > 0 & all(c("Name", "RGT", "Date_time", "day_of_year", "cycle", "geometry") %in% colnames(res_rgt_many)))
})

testthat::test_that("the function 'vsi_time_specific_orbits_wkt()' returns the expected output for an input WKT!", {

  date_start = '2020-01-01'
  date_end = '2020-01-01'
  sample_rgts = c("24", "32")
  WKT = 'POLYGON ((-14.765 18.979, -11.25 18.979, -11.25 21.943, -14.765 21.943, -14.765 18.979))'

  orb_cyc_single = vsi_time_specific_orbits_wkt(date_from = date_start,
                                                date_to = date_end,
                                                RGTs = sample_rgts,
                                                wkt_filter = WKT,
                                                verbose = FALSE)

  testthat::expect_true(inherits(orb_cyc_single, c("sf", "data.table", "data.frame")) & nrow(orb_cyc_single) == 2 & all(c("Name", "RGT", "Date_time", "day_of_year", "cycle", "geometry") %in% colnames(orb_cyc_single)))
})

I'll upload the updated version of the package to CRAN in a few days.
I'll close the issue. Feel free to re-open it if something does not work as expected.