`include_reference = TRUE` erroneously works with `datawizard::contr.deviation()`

Question

`include_reference = TRUE` erroneously works with `datawizard::contr.deviation()`

mattansb opened this issue 6 months ago · 6 comments

include_reference = TRUE should only add the referent when standard dummy coding is used, but for some reason it also adds a (wrong) 0 when datawizard::contr.deviation() is used.

contrs <- list(
  # makes sense:
  treatment = contr.treatment,
  SAS = contr.SAS,
  
  # doens't make sense:
  deviation = datawizard::contr.deviation,
  sum = contr.sum,
  helmert = contr.helmert,
  poly = contr.poly,
  equalprior = bayestestR::contr.equalprior
)

data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)

lapply(contrs, function(.c) {
  lm(mpg ~ cyl, data = mtcars, contrasts = list(cyl = .c))
}) |> 
  parameters::compare_parameters(include_reference = TRUE, select = "est")
#> Parameter    | treatment |   SAS | deviation |   sum | helmert |  poly | equalprior
#> -----------------------------------------------------------------------------------
#> (Intercept)  |     26.66 | 15.10 |     20.50 | 20.50 |   20.50 | 20.50 |      20.50
#> cyl (4)      |      0.00 | 11.56 |      0.00 |       |         |       |           
#> cyl (6)      |     -6.92 |  4.64 |     -6.92 |       |         |       |           
#> cyl (8)      |    -11.56 |  0.00 |    -11.56 |       |         |       |           
#> cyl (1)      |           |       |           |  6.16 |   -3.46 |       |      -3.28
#> cyl (2)      |           |       |           | -0.76 |   -2.70 |       |       7.55
#> cyl (.L)     |           |       |           |       |         | -8.18 |           
#> cyl (.Q)     |           |       |           |       |         |  0.93 |           
#> -----------------------------------------------------------------------------------
#> Observations |        32 |    32 |        32 |    32 |      32 |    32 |         32

Answer 1 · 2024-04-10T06:07:13.000Z

Is there a safe way to find out which contrasts were used?

contrs <- list(
  # makes sense:
  treatment = contr.treatment,
  SAS = contr.SAS,
  
  # doens't make sense:
  deviation = datawizard::contr.deviation,
  sum = contr.sum,
  helmert = contr.helmert,
  poly = contr.poly,
  equalprior = bayestestR::contr.equalprior
)

data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)

models <- lapply(contrs, function(.c) {
  lm(mpg ~ cyl, data = mtcars, contrasts = list(cyl = .c))
})

lapply(models, function(i) i$contrasts)
#> $treatment
#> $treatment$cyl
#>   6 8
#> 4 0 0
#> 6 1 0
#> 8 0 1
#> 
#> 
#> $SAS
#> $SAS$cyl
#>   4 6
#> 4 1 0
#> 6 0 1
#> 8 0 0
#> 
#> 
#> $deviation
#> $deviation$cyl
#>            6          8
#> 4 -0.3333333 -0.3333333
#> 6  0.6666667 -0.3333333
#> 8 -0.3333333  0.6666667
#> 
#> 
#> $sum
#> $sum$cyl
#>   [,1] [,2]
#> 4    1    0
#> 6    0    1
#> 8   -1   -1
#> 
#> 
#> $helmert
#> $helmert$cyl
#>   [,1] [,2]
#> 4   -1   -1
#> 6    1   -1
#> 8    0    2
#> 
#> 
#> $poly
#> $poly$cyl
#>              .L         .Q
#> 4 -7.071068e-01  0.4082483
#> 6 -7.850462e-17 -0.8164966
#> 8  7.071068e-01  0.4082483
#> 
#> 
#> $equalprior
#> $equalprior$cyl
#>         [,1]       [,2]
#> 4  0.0000000  0.8164966
#> 6 -0.7071068 -0.4082483
#> 8  0.7071068 -0.4082483

^{Created on 2024-04-10 with reprex v2.1.0}

Answer 2 · 2024-04-10T08:20:14.000Z

Is there a safe way to find out which contrasts were used?

No, I don't think so, but it should only be done if the contrast matrix has a row of all 0s.

contrs <- list(
  # makes sense:
  treatment = contr.treatment,
  SAS = contr.SAS,
  
  # doens't make sense:
  deviation = datawizard::contr.deviation,
  sum = contr.sum,
  helmert = contr.helmert,
  poly = contr.poly,
  equalprior = bayestestR::contr.equalprior
)

data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)

models <- lapply(contrs, function(.c) {
  lm(mpg ~ cyl, data = mtcars, contrasts = list(cyl = .c))
})

all_zero_row <- function(m) {
  apply(m == 0, 1, all)
}

has_zeros_row <- function(m) {
  any(all_zero_row(m))
}

lapply(models, function(i) {
  lapply(i$contrasts, has_zeros_row)
})
#> $treatment
#> $treatment$cyl
#> [1] TRUE
#> 
#> 
#> $SAS
#> $SAS$cyl
#> [1] TRUE
#> 
#> 
#> $deviation
#> $deviation$cyl
#> [1] FALSE
#> 
#> 
#> $sum
#> $sum$cyl
#> [1] FALSE
#> 
#> 
#> $helmert
#> $helmert$cyl
#> [1] FALSE
#> 
#> 
#> $poly
#> $poly$cyl
#> [1] FALSE
#> 
#> 
#> $equalprior
#> $equalprior$cyl
#> [1] FALSE

^{Created on 2024-04-10 with reprex v2.1.0}

Answer 3 · 2024-04-26T06:51:55.000Z

library(parameters)
data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)
mtcars$gear <- factor(mtcars$gear)

m <- lm(mpg ~ cyl + gear, data = mtcars, contrasts = list(cyl = datawizard::contr.deviation))
model_parameters(m, include_reference = TRUE)
#> Parameter   | Coefficient |   SE |          95% CI | t(27) |      p
#> -------------------------------------------------------------------
#> (Intercept) |       19.70 | 1.18 | [ 17.28, 22.11] | 16.71 | < .001
#> cyl [6]     |       -6.66 | 1.63 | [-10.00, -3.31] | -4.09 | < .001
#> cyl [8]     |      -10.54 | 1.96 | [-14.56, -6.52] | -5.38 | < .001
#> gear [3]    |        0.00 |      |                 |       |       
#> gear [4]    |        1.32 | 1.93 | [ -2.63,  5.28] |  0.69 | 0.498 
#> gear [5]    |        1.50 | 1.85 | [ -2.31,  5.31] |  0.81 | 0.426
#> 
#> Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
#>   using a Wald t-distribution approximation.

m <- lm(mpg ~ cyl + gear, data = mtcars)
model_parameters(m, include_reference = TRUE)
#> Parameter   | Coefficient |   SE |          95% CI | t(27) |      p
#> -------------------------------------------------------------------
#> (Intercept) |       25.43 | 1.88 | [ 21.57, 29.29] | 13.52 | < .001
#> cyl [4]     |        0.00 |      |                 |       |       
#> cyl [6]     |       -6.66 | 1.63 | [-10.00, -3.31] | -4.09 | < .001
#> cyl [8]     |      -10.54 | 1.96 | [-14.56, -6.52] | -5.38 | < .001
#> gear [3]    |        0.00 |      |                 |       |       
#> gear [4]    |        1.32 | 1.93 | [ -2.63,  5.28] |  0.69 | 0.498 
#> gear [5]    |        1.50 | 1.85 | [ -2.31,  5.31] |  0.81 | 0.426
#> 
#> Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
#>   using a Wald t-distribution approximation.

m <- lm(
  mpg ~ cyl + gear,
  data = mtcars,
  contrasts = list(
    cyl = datawizard::contr.deviation,
    gear = contr.sum
  )
)
model_parameters(m, include_reference = TRUE)
#> Parameter   | Coefficient |   SE |          95% CI | t(27) |      p
#> -------------------------------------------------------------------
#> (Intercept) |       20.64 | 0.67 | [ 19.26, 22.01] | 30.76 | < .001
#> cyl [6]     |       -6.66 | 1.63 | [-10.00, -3.31] | -4.09 | < .001
#> cyl [8]     |      -10.54 | 1.96 | [-14.56, -6.52] | -5.38 | < .001
#> gear [1]    |       -0.94 | 1.09 | [ -3.18,  1.30] | -0.86 | 0.396 
#> gear [2]    |        0.38 | 1.11 | [ -1.90,  2.67] |  0.34 | 0.734
#> 
#> Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
#>   using a Wald t-distribution approximation.

m <- lm(
  mpg ~ cyl + gear,
  data = mtcars,
  contrasts = list(
    cyl = contr.SAS,
    gear = contr.sum
  )
)
model_parameters(m, include_reference = TRUE)
#> Parameter   | Coefficient |   SE |         95% CI | t(27) |      p
#> ------------------------------------------------------------------
#> (Intercept) |       15.83 | 1.24 | [13.28, 18.37] | 12.75 | < .001
#> cyl [8]     |        0.00 |      |                |       |       
#> cyl [4]     |       10.54 | 1.96 | [ 6.52, 14.56] |  5.38 | < .001
#> cyl [6]     |        3.89 | 1.88 | [ 0.03,  7.75] |  2.07 | 0.049 
#> gear [1]    |       -0.94 | 1.09 | [-3.18,  1.30] | -0.86 | 0.396 
#> gear [2]    |        0.38 | 1.11 | [-1.90,  2.67] |  0.34 | 0.734
#> 
#> Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
#>   using a Wald t-distribution approximation.

m <- lm(
  mpg ~ cyl + gear,
  data = mtcars,
  contrasts = list(
    cyl = contr.SAS,
    gear = contr.treatment
  )
)
model_parameters(m, include_reference = TRUE)
#> Parameter   | Coefficient |   SE |         95% CI | t(27) |      p
#> ------------------------------------------------------------------
#> (Intercept) |       14.89 | 0.92 | [13.00, 16.77] | 16.19 | < .001
#> cyl [8]     |        0.00 |      |                |       |       
#> cyl [4]     |       10.54 | 1.96 | [ 6.52, 14.56] |  5.38 | < .001
#> cyl [6]     |        3.89 | 1.88 | [ 0.03,  7.75] |  2.07 | 0.049 
#> gear [3]    |        0.00 |      |                |       |       
#> gear [4]    |        1.32 | 1.93 | [-2.63,  5.28] |  0.69 | 0.498 
#> gear [5]    |        1.50 | 1.85 | [-2.31,  5.31] |  0.81 | 0.426
#> 
#> Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
#>   using a Wald t-distribution approximation.

^{Created on 2024-04-26 with reprex v2.1.0}

Answer 4 · 2024-04-26T06:54:40.000Z

This currently works for those model objects that have stored their contrasts as model$contrasts. We need to make this more robust for other models, too - or does every package with modelling functions save their contrasts like this?

Answer 5 · 2024-04-26T06:59:52.000Z

glmmTMB saves it as function...

library(glmmTMB)
data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)
mtcars$gear <- factor(mtcars$gear)

m <- glmmTMB(mpg ~ cyl + (1 | gear), data = mtcars, contrasts = list(cyl = contr.sum))
m$modelInfo$contrasts
#> $cyl
#> function (n, contrasts = TRUE, sparse = FALSE) 
#> {
#>     if (length(n) <= 1L) {
#>         if (is.numeric(n) && length(n) == 1L && n > 1L) 
#>             levels <- seq_len(n)
#>         else stop("not enough degrees of freedom to define contrasts")
#>     }
#>     else levels <- n
#>     levels <- as.character(levels)
#>     cont <- .Diag(levels, sparse = sparse)
#>     if (contrasts) {
#>         cont <- cont[, -length(levels), drop = FALSE]
#>         cont[length(levels), ] <- -1
#>         colnames(cont) <- NULL
#>     }
#>     cont
#> }
#> <bytecode: 0x000002727afd2770>
#> <environment: namespace:stats>

^{Created on 2024-04-26 with reprex v2.1.0}

Answer 6 · 2024-04-30T20:24:20.000Z

You can apply those functions (with some n>2) and then run the test:

all_zero_row <- function(m) {
  apply(m == 0, 1, all)
}

has_zeros_row <- function(m) {
  any(all_zero_row(m))
}


library(glmmTMB)
data("mtcars")
mtcars$cyl <- factor(mtcars$cyl)
mtcars$gear <- factor(mtcars$gear)

m <- glmmTMB(
  mpg ~ cyl + gear,
  data = mtcars,
  contrasts = list(cyl = contr.sum, gear = contr.treatment)
)

glmmtmb_contr_foos <- m$modelInfo$contrasts

glmmtmb_contr_foos |> 
  lapply(\(f) f(3)) |> 
  lapply(has_zeros_row)
#> $cyl
#> [1] FALSE
#> 
#> $gear
#> [1] TRUE
#>