SymbolixAU/jsonify

fill_na

Opened this issue · 6 comments

Add a fill_na option to from_json() so that, if a key is missing from some objects when running simplify_dataframe(), the column is still created and the missing entries are filled with NA.

# Target shape: every key becomes a column and values that are absent for a
# row are NA.
df <- structure(
  list(c = c(123, NA), d = c(456, NA), f = c(NA, "cats")),
  class = "data.frame",
  row.names = c(NA, -2L)
)
df
#    c   d    f
# 1 123 456 <NA>
# 2  NA  NA cats

first iteration

# Objects with differing key sets are simplified to a single data.frame with
# one column per key, but some of the missing entries come back as NULL
# instead of NA (see column y in the output below).
js <- '[{"x":1,"z":1},{"x":2,"y":1},{"y":2},{"z":3.3}]'
from_json( js )
#    x   z    y
# 1  1 1.0 NULL
# 2  2  NA    1
# 3 NA  NA    2
# 4 NA 3.3 NULL

Those NULLs need to be NAs.

more examples

# Mixed value types under one key: per the str() output below, y (1 and
# "hello") is returned as a character column, and keys that are absent from
# an object show up as NA.
js <- '[{"x":1,"y":1},{"y":"hello"},{"z":true}]'
res <- from_json( js )
res
#    x     y    z
# 1  1     1   NA
# 2 NA hello   NA
# 3 NA  <NA> TRUE

str( res )
# 'data.frame':	3 obs. of  3 variables:
# $ x: int  1 NA NA
# $ y: chr  "1" "hello" NA
# $ z: logi  NA NA TRUE

# Array values of differing lengths: y prints as a list column, with NA for
# the object that has no y key.
js <- '[{"x":1,"y":[1,2]},{"x":1,"y":[1,3,3]},{"x":3}]'
res <- from_json( js )
res
#   x       y
# 1 1    1, 2
# 2 1 1, 3, 3
# 3 3      NA

# NOTE(review): the str() output below reports "2 obs." / "List of 2" and
# only two x values, yet res prints three rows and three list elements are
# listed -- the pasted output looks stale; re-run to confirm.
str( res )
# 'data.frame':	2 obs. of  2 variables:
# $ x: int  1 1
# $ y:List of 2
# ..$ : int  1 2
# ..$ : int  1 3 3
# ..$ : int NA
# An empty object ({}) as a value: per the str() output below, x becomes a
# list column holding NULL for that row, and y is also returned as a list
# column.
js <- '[{"x":1,"y":[1,2,3]},{"x":{},"y":[1,3,3]}]'
# Spell out TRUE: T is an ordinary variable and can be reassigned.
res <- from_json( js, na_fill = TRUE )
res

str( res )
# 'data.frame':	2 obs. of  2 variables:
#   $ x:List of 2
# ..$ : int 1
# ..$ : NULL
# $ y:List of 2
# ..$ : int  1 2 3
# ..$ : int  1 3 3

y should be a matrix? (to be consistent with na_fill = FALSE)

duplicate keys?

# Duplicate key: "val" appears twice in the first object. With
# na_fill = TRUE the printed result shows only the first value ("a").
js <- '[{"id":1,"val":"a","val":1},{"id":2,"val":"b"}]'
from_json( js , na_fill = TRUE )
#   id val
# 1  1   a
# 2  2   b

But this is different behaviour from the default call (without na_fill):

# Same input without na_fill: the result is printed as a plain list, and
# both "val" entries of the first object are kept.
js <- '[{"id":1,"val":"a","val":1},{"id":2,"val":"b"}]'
from_json( js )

# [[1]]
# [[1]]$id
# [1] 1

# [[1]]$val
# [1] "a"

# [[1]]$val
# [1] 1


# [[2]]
# [[2]]$id
# [1] 2

# [[2]]$val
# [1] "b"

I've added this difference to the docs, but I'm not sure if it's right yet...

reference: https://stackoverflow.com/a/23195243/5977215

benchmarks

# Benchmark: parse a 1e5-row data.frame, with NAs scattered through three
# columns, from JSON using jsonify (na_fill = TRUE) versus jsonlite.
#
# The workspace is deliberately not wiped (no rm(list = ls())), so running
# this snippet does not destroy the caller's session.
set.seed(20191012)
n <- 1e5
df <- data.frame(
  id = seq_len(n),
  value = sample(letters, size = n, replace = TRUE),
  val2 = rnorm(n = n),
  log = sample(c(TRUE, FALSE), size = n, replace = TRUE)
  # , dte = sample(seq(as.Date("2018-01-01"), as.Date("2018-01-31"), length.out = n), size = n, replace = TRUE)
)

# Blank out roughly a third of each column; n %/% 3 makes the integer sample
# size explicit instead of passing a fractional size to sample().
n_missing <- n %/% 3
df[sample(seq_len(n), size = n_missing), "id"] <- NA_integer_
df[sample(seq_len(n), size = n_missing), "val2"] <- NA_real_
df[sample(seq_len(n), size = n_missing), "log"] <- NA

js <- jsonlite::toJSON(df)

# Calls are namespaced so the snippet does not depend on which packages
# happen to be attached in the session.
microbenchmark::microbenchmark(
  jsonify = {
    jfy <- jsonify::from_json(js, na_fill = TRUE)
  },
  jsonlite = {
    jlt <- jsonlite::fromJSON(js)
  },
  times = 5
)

# Unit: milliseconds
#      expr      min       lq     mean   median       uq      max neval
#  jsonify 798.3926 1079.845 1217.471 1348.545 1372.726 1487.848     5
# jsonlite 976.6249 1043.940 1190.661 1056.836 1416.410 1459.494     5
# Benchmark: round-trip a large nested list (numeric vectors, a two-column
# matrix, nested lists and a character vector) through JSON, then compare
# jsonify::from_json() with jsonlite::fromJSON().
library(microbenchmark)

# Seeded so the generated input is reproducible between runs.
set.seed(20191012)
x <- rnorm(1e6)
y <- rnorm(1e6)
z <- rnorm(1e6)
m <- matrix(rnorm(1e6), ncol = 2)
chars <- sample(letters, 1e6, replace = TRUE)
l <- list(x, m, list(y, list(z)), chars)

js <- jsonify::to_json(l)

jfy <- jsonify::from_json(js)
jlt <- jsonlite::fromJSON(js)

# Compare the structure each parser returns for the same input.
str(jfy)
str(jlt)

microbenchmark(
  jfy = {
    jsonify::from_json(js)
  },
  jlt = {
    jsonlite::fromJSON(js)
  },
  times = 2
)

# Unit: seconds
# expr       min        lq      mean    median       uq      max neval
# jfy  3.077851  3.077851  3.303551  3.303551  3.52925  3.52925     2
# jlt 16.732796 16.732796 16.938016 16.938016 17.14324 17.14324     2