ncn-foreigners/nonprobsvy

Default `epsilon` value in mass imputation

Closed this issue · 0 comments

By default when calling controlOut epsilon value is set to 1e-6 which is a sensible choice FOR stats::glm but this epsilon value is also used for fitting RANN::nn2(). The default eps in RANN::nn2() is set to 0, this is because when eps = 0 RANN::nn2 performs exact nearest neighbour search but when eps > 0 and appropriate solution is used.

This leads to worse results (as presented in code bellow) in predictive mean matching (ditto for kNN imputation) and can be confusing and users would probably expect the default to be set to an exact solution

library(nonprobsvy)

set.seed(1234567890)
N <- 1e5 ## 1000000
n <- 100
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
population <- data.frame(
  x1,
  x2,
  y1 = 1 + x1 + x2 + epsilon,
  y2 = 0.5*(x1 - 0.5)^2 + x2 + epsilon,
  p1 = exp(x2)/(1+exp(x2)),
  p2 = exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2)),
  base_w_srs = N/n, 
  flag_bd1 = rbinom(n = N, size = 1, prob = p1), 
  flag_srs = as.numeric(1:N %in% sample(1:N, size = n))
)
base_w_bd <- N/sum(population$flag_bd1)
sample_prob <- svydesign(ids= ~1, weights = ~ base_w_srs,
                         data = subset(population, flag_srs == 1))

pmm1 <- nonprob(
  outcome = y1 ~ x1 + x2,
  data = population[population$flag_bd1 == 1, , drop = FALSE],
  svydesign = sample_prob,
  method_outcome = "pmm",
  family_outcome = "gaussian",
  control_outcome = controlOut(k=1)
)

pmm2 <- nonprob(
  outcome = y2 ~ x1 + x2,
  data = population[population$flag_bd1 == 1, , drop = FALSE],
  svydesign = sample_prob,
  method_outcome = "pmm",
  family_outcome = "gaussian",
  control_outcome = controlOut(k=1)
)

# manually

df_prob <- population[population$flag_srs == 1, ]
df_non  <- population[population$flag_bd1 == 1, ]

lm1 <- lm(y1 ~ x1 + x2, data = df_non)
lm2 <- lm(y2 ~ x1 + x2, data = df_non)

pred1 <- predict(lm1, newdata = df_prob)
pred2 <- predict(lm2, newdata = df_prob)

nn1 <- sapply(pred1, FUN = function(x) {which.min(abs(df_non$y1 - x))})
nn2 <- sapply(pred2, FUN = function(x) {which.min(abs(df_non$y2 - x))})

(est1 <- weighted.mean(x = df_non$y1[nn1],
                       w = df_prob$base_w_srs))
pmm1$output$mean

(est2 <- weighted.mean(x = df_non$y2[nn2],
                       w = df_prob$base_w_srs))
pmm2$output$mean
population[, c("y1", "y2"), drop = FALSE] |> colMeans()

xx1 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1))
# the same
xx11 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1), search = "linear")

xx111 <- RANN::nn2(data = cbind(df_non$y1), query = cbind(pred1), k = 1, eps = 0)

cbind(nn1, xx1$id, pmm1$outcome$y1$model_rand$nn.idx, xx111$nn.idx)