fill_na
Opened this issue · 6 comments
Deleted user commented
Add a `fill_na` option to `from_json()` so that, if a column doesn't exist in some records when running `simplify_dataframe()`, it is created and the missing entries are filled with `NA`.
df <- structure(list(c = c(123, NA), d = c(456, NA), f = c(NA, "cats"
+ )), class = "data.frame", row.names = c(NA, -2L))
df
# c d f
# 1 123 456 <NA>
# 2 NA NA cats
Deleted user commented
first iteration
js <- '[{"x":1,"z":1},{"x":2,"y":1},{"y":2},{"z":3.3}]'
from_json( js )
# x z y
# 1 1 1.0 NULL
# 2 2 NA 1
# 3 NA NA 2
# 4 NA 3.3 NULL
Need those `NULL`s to be `NA`s.
Deleted user commented
more examples
js <- '[{"x":1,"y":1},{"y":"hello"},{"z":true}]'
res <- from_json( js )
res
# x y z
# 1 1 1 NA
# 2 NA hello NA
# 3 NA <NA> TRUE
str( res )
# 'data.frame': 3 obs. of 3 variables:
# $ x: int 1 NA NA
# $ y: chr "1" "hello" NA
# $ z: logi NA NA TRUE
js <- '[{"x":1,"y":[1,2]},{"x":1,"y":[1,3,3]},{"x":3}]'
res <- from_json( js )
res
# x y
# 1 1 1, 2
# 2 1 1, 3, 3
# 3 3 NA
str( res )
# 'data.frame': 2 obs. of 2 variables:
# $ x: int 1 1
# $ y:List of 2
# ..$ : int 1 2
# ..$ : int 1 3 3
# ..$ : int NA
Deleted user commented
js <- '[{"x":1,"y":[1,2,3]},{"x":{},"y":[1,3,3]}]'
res <- from_json( js, na_fill = T )
res
str( res )
# 'data.frame': 2 obs. of 2 variables:
# $ x:List of 2
# ..$ : int 1
# ..$ : NULL
# $ y:List of 2
# ..$ : int 1 2 3
# ..$ : int 1 3 3
`y` should be a matrix? (to be consistent with `na_fill = FALSE`)
Deleted user commented
duplicate keys?
js <- '[{"id":1,"val":"a","val":1},{"id":2,"val":"b"}]'
from_json( js , na_fill = TRUE )
# id val
# 1 1 a
# 2 2 b
But this is different behaviour to calling `from_json()` without `na_fill`:
js <- '[{"id":1,"val":"a","val":1},{"id":2,"val":"b"}]'
from_json( js )
# [[1]]
# [[1]]$id
# [1] 1
# [[1]]$val
# [1] "a"
# [[1]]$val
# [1] 1
# [[2]]
# [[2]]$id
# [1] 2
# [[2]]$val
# [1] "b"
I've added this difference to the docs, but I'm not sure if it's right yet...
reference: https://stackoverflow.com/a/23195243/5977215
dcooley commented
benchmarks
rm(list=ls()); gc()
set.seed(20191012)
n <- 1e5
df <- data.frame(
id = 1:n
, value = sample(letters, size = n, replace = T)
, val2 = rnorm(n = n)
, log = sample(c(T,F), size = n, replace = T)
#, dte = sample(seq(as.Date("2018-01-01"), as.Date("2018-01-31"), length.out = n), size = n, replace = T)
)
df[sample(1:n, size = n / 3), 'id'] <- NA_integer_
df[sample(1:n, size = n / 3), 'val2'] <- NA_real_
df[sample(1:n, size = n / 3), 'log'] <- NA
js <- jsonlite::toJSON( df )
microbenchmark(
jsonify = {
jfy <- from_json( js, na_fill = TRUE )
},
jsonlite = {
jlt <- jsonlite::fromJSON( js )
},
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# jsonify 798.3926 1079.845 1217.471 1348.545 1372.726 1487.848 5
# jsonlite 976.6249 1043.940 1190.661 1056.836 1416.410 1459.494 5
dcooley commented
x <- rnorm(1e6)
y <- rnorm(1e6)
z <- rnorm(1e6)
m <- matrix(rnorm(1e6), ncol = 2)
l <- sample(letters, 1e6, replace = T)
l <- list(x, m, list(y, list(z)), l)
js <- to_json( l )
jfy <- from_json( js )
jlt <- jsonlite::fromJSON( js )
str( jfy )
str( jlt )
library(microbenchmark)
microbenchmark(
jfy = {
jsonify::from_json( js )
},
jlt = {
jsonlite::fromJSON( js )
},
times = 2
)
# Unit: seconds
# expr min lq mean median uq max neval
# jfy 3.077851 3.077851 3.303551 3.303551 3.52925 3.52925 2
# jlt 16.732796 16.732796 16.938016 16.938016 17.14324 17.14324 2