Simulating F-statistics

library(tidyverse)

## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Let’s create 5000 data points from a multi-group comparison with no difference in group means

n_data <- 5000
n_per_group <- 10
n_groups <- 3
overall_mean <- 0
overal_sd <- 1

set.seed(1)
sim_df <- bind_rows(lapply(seq_len(5000), function(x){
  tibble(
    id = x,
    value = rnorm(n = n_per_group * n_groups, 
                  mean = overall_mean,
                  sd = overal_sd),
    group = rep(letters[seq_len(n_groups)], each = 10)
  )
}))

As an example we will inspect one of the instances:

filter(sim_df, id == 20) %>% 
  ggplot(aes(x = group, y = value)) +
  geom_dotplot(binaxis='y', stackdir='center') +
  stat_summary(fun=mean, geom="point", 
               shape=18, color="red")

## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Now we will compute, both the group mean and the overall mean for each instance (unique id):

sim_df <- sim_df %>% 
  group_by(id) %>% 
  mutate(obs_overall_mean = mean(value)) %>% 
  group_by(id, group) %>% 
  mutate(obs_group_mean = mean(value)) %>% 
  ungroup()

sim_df

## # A tibble: 150,000 x 5
##       id  value group obs_overall_mean obs_group_mean
##    <int>  <dbl> <chr>            <dbl>          <dbl>
##  1     1 -0.626 a               0.0825          0.132
##  2     1  0.184 a               0.0825          0.132
##  3     1 -0.836 a               0.0825          0.132
##  4     1  1.60  a               0.0825          0.132
##  5     1  0.330 a               0.0825          0.132
##  6     1 -0.820 a               0.0825          0.132
##  7     1  0.487 a               0.0825          0.132
##  8     1  0.738 a               0.0825          0.132
##  9     1  0.576 a               0.0825          0.132
## 10     1 -0.305 a               0.0825          0.132
## # … with 149,990 more rows

And now we’ll compute the residuals within groups and globally:

sim_df <- sim_df %>% 
  mutate(overall_residuals = value - obs_overall_mean,
         group_residuals = value - obs_group_mean) 

sim_df

## # A tibble: 150,000 x 7
##       id  value group obs_overall_mean obs_group_mean overall_residua…
##    <int>  <dbl> <chr>            <dbl>          <dbl>            <dbl>
##  1     1 -0.626 a               0.0825          0.132           -0.709
##  2     1  0.184 a               0.0825          0.132            0.101
##  3     1 -0.836 a               0.0825          0.132           -0.918
##  4     1  1.60  a               0.0825          0.132            1.51 
##  5     1  0.330 a               0.0825          0.132            0.247
##  6     1 -0.820 a               0.0825          0.132           -0.903
##  7     1  0.487 a               0.0825          0.132            0.405
##  8     1  0.738 a               0.0825          0.132            0.656
##  9     1  0.576 a               0.0825          0.132            0.493
## 10     1 -0.305 a               0.0825          0.132           -0.388
## # … with 149,990 more rows, and 1 more variable: group_residuals <dbl>

Next, we will compute residual sum of squares

rss_df <- sim_df %>% 
  group_by(id) %>% 
  summarise(RSS_0 = sum(overall_residuals^2),
            RSS_1 = sum(group_residuals^2)) %>% 
  ungroup

## `summarise()` ungrouping output (override with `.groups` argument)

hist(rss_df$RSS_0, probability = TRUE)
lines(seq(min(rss_df$RSS_0), max(rss_df$RSS_0), 0.1),
      dchisq(seq(min(rss_df$RSS_0), max(rss_df$RSS_0), 0.1), df = n_groups * n_per_group - 1), col = "red")

hist(rss_df$RSS_1, probability = TRUE)
lines(seq(min(rss_df$RSS_1), max(rss_df$RSS_1), 0.1),
      dchisq(seq(min(rss_df$RSS_1), max(rss_df$RSS_1), 0.1), df = n_groups * n_per_group - n_groups), col = "red")

hist(rss_df$RSS_0 - rss_df$RSS_1, probability = TRUE)
lines(seq(min(rss_df$RSS_0 - rss_df$RSS_1), 
          max(rss_df$RSS_0 - rss_df$RSS_1), 0.1),
      dchisq(seq(min(rss_df$RSS_0 - rss_df$RSS_1), 
                 max(rss_df$RSS_0 - rss_df$RSS_1), 0.1), df = n_groups - 1), col = "red")

d1 <- n_groups - 1
d2 <- n_groups * n_per_group - n_groups
Fstat <- (rss_df$RSS_0 - rss_df$RSS_1)/rss_df$RSS_1 * d2/d1

hist(Fstat, probability = TRUE)
lines(seq(min(Fstat), max(Fstat), 0.1),
      df(seq(min(Fstat), max(Fstat), 0.1), 
         df1 = d1, df2 = d2), col = "red")

nkurzaw/simulate_fstat

Simulating F-statistics