Default `epsilon` value in mass imputation
Closed this issue · 0 comments
Kertoo commented
By default when calling controlOut
epsilon value is set to 1e-6
which is a sensible choice FOR stats::glm
but this epsilon
value is also used for fitting RANN::nn2()
. The default eps
in RANN::nn2()
is set to 0
, this is because when eps = 0
RANN::nn2
performs exact nearest neighbour search but when eps > 0
and appropriate solution is used.
This leads to worse results (as presented in code bellow) in predictive mean matching (ditto for kNN imputation) and can be confusing and users would probably expect the default to be set to an exact solution
library(nonprobsvy)
set.seed(1234567890)
N <- 1e5 ## 1000000
n <- 100
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
population <- data.frame(
x1,
x2,
y1 = 1 + x1 + x2 + epsilon,
y2 = 0.5*(x1 - 0.5)^2 + x2 + epsilon,
p1 = exp(x2)/(1+exp(x2)),
p2 = exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2)),
base_w_srs = N/n,
flag_bd1 = rbinom(n = N, size = 1, prob = p1),
flag_srs = as.numeric(1:N %in% sample(1:N, size = n))
)
base_w_bd <- N/sum(population$flag_bd1)
sample_prob <- svydesign(ids= ~1, weights = ~ base_w_srs,
data = subset(population, flag_srs == 1))
pmm1 <- nonprob(
outcome = y1 ~ x1 + x2,
data = population[population$flag_bd1 == 1, , drop = FALSE],
svydesign = sample_prob,
method_outcome = "pmm",
family_outcome = "gaussian",
control_outcome = controlOut(k=1)
)
pmm2 <- nonprob(
outcome = y2 ~ x1 + x2,
data = population[population$flag_bd1 == 1, , drop = FALSE],
svydesign = sample_prob,
method_outcome = "pmm",
family_outcome = "gaussian",
control_outcome = controlOut(k=1)
)
# manually
df_prob <- population[population$flag_srs == 1, ]
df_non <- population[population$flag_bd1 == 1, ]
lm1 <- lm(y1 ~ x1 + x2, data = df_non)
lm2 <- lm(y2 ~ x1 + x2, data = df_non)
pred1 <- predict(lm1, newdata = df_prob)
pred2 <- predict(lm2, newdata = df_prob)
nn1 <- sapply(pred1, FUN = function(x) {which.min(abs(df_non$y1 - x))})
nn2 <- sapply(pred2, FUN = function(x) {which.min(abs(df_non$y2 - x))})
(est1 <- weighted.mean(x = df_non$y1[nn1],
w = df_prob$base_w_srs))
pmm1$output$mean
(est2 <- weighted.mean(x = df_non$y2[nn2],
w = df_prob$base_w_srs))
pmm2$output$mean
population[, c("y1", "y2"), drop = FALSE] |> colMeans()
xx1 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1))
# the same
xx11 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1), search = "linear")
xx111 <- RANN::nn2(data = cbind(df_non$y1), query = cbind(pred1), k = 1, eps = 0)
cbind(nn1, xx1$id, pmm1$outcome$y1$model_rand$nn.idx, xx111$nn.idx)