@juliasilge

youtube.com/juliasilge

juliasilge.com

Simulation is a powerful tool to…

  • make assumptions concrete
  • get on the same page with colleagues about tradeoffs
  • ultimately make better decisions for our everyday data science tasks

How many folds is too many?

library(tidymodels)

sim_df <- sim_regression(num_samples = 1e3)
glimpse(sim_df)
#> Rows: 1,000
#> Columns: 21
#> $ outcome      <dbl> -4.928525, 31.243149, -32.990575, 19.297869, 33.558983, 2…
#> $ predictor_01 <dbl> 1.45775279, 0.82638847, -4.92052194, 6.46431441, -0.82217…
#> $ predictor_02 <dbl> 4.116466132, 2.610214425, 0.675752837, 2.570178449, 2.142…
#> $ predictor_03 <dbl> 5.4458548, -3.1014774, 1.1625076, 3.4390711, 3.3372852, -…
#> $ predictor_04 <dbl> -1.01284231, 0.81065762, -3.13098977, -1.75485152, 3.2372…
#> $ predictor_05 <dbl> 1.54527981, 0.74044008, -3.97539047, -4.60811097, 1.07821…
#> $ predictor_06 <dbl> -6.34279526, 4.20078207, 2.14755389, -1.79727132, 3.49558…
#> $ predictor_07 <dbl> 0.9105746, -1.0624028, 4.1461805, 2.6135963, 1.8528900, -…
#> $ predictor_08 <dbl> 3.6025893, -0.4133759, -5.2788528, -3.4689416, -6.0568117…
#> $ predictor_09 <dbl> -3.8147997, -2.3600506, -2.3775382, -2.3569280, 2.2462209…
#> $ predictor_10 <dbl> -2.9292750, -4.6403150, -1.5438041, 1.1909429, -1.4302187…
#> $ predictor_11 <dbl> 1.954786777, -0.458930773, 0.368848197, -5.505276854, 0.8…
#> $ predictor_12 <dbl> 0.56734762, 3.55299901, 4.06405586, -3.82337471, -0.79193…
#> $ predictor_13 <dbl> 2.59202997, 0.90878202, -0.37713684, -3.44231442, 0.47654…
#> $ predictor_14 <dbl> 0.20347530, -1.33296884, -2.83929165, -2.02793712, -2.090…
#> $ predictor_15 <dbl> 1.6814880, 4.1218454, -1.6344148, 0.8939713, -4.0727849, …
#> $ predictor_16 <dbl> -2.23030461, 0.52884386, 5.80379562, -3.97012420, -1.5965…
#> $ predictor_17 <dbl> 1.79706540, 2.61977702, 1.75621990, 2.85542716, 2.1002845…
#> $ predictor_18 <dbl> 1.5624638, -1.3170392, 0.3488918, 0.3782973, -3.0179789, …
#> $ predictor_19 <dbl> -2.89790636, -2.31515186, 4.77557443, -4.31079590, 3.7908…
#> $ predictor_20 <dbl> -1.44944642, 6.12993329, 5.39468018, 0.43952610, -2.98379…

How many folds is too many?

predictor_01 + sin(predictor_02) + log(abs(predictor_03)) +
  predictor_04^2 + predictor_05 * predictor_06 +
  ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) +
  ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) +
  sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) +
  ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) -
  2 * predictor_18 - predictor_19 * predictor_20

The sim_regression() function simulates the outcome from 20 independent Gaussian predictors via the nonlinear relationship above, plus Gaussian noise.
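
For intuition, here is a hand-rolled sketch of this kind of data-generating process. The predictor and noise standard deviations below are illustrative assumptions, not necessarily the values sim_regression() actually uses; see ?modeldata::sim_regression for the exact specification.

# A sketch only: sd_x and sd_noise are assumed values for illustration
simulate_by_hand <- function(num_samples, sd_x = 3, sd_noise = 3) {
  dat <- matrix(rnorm(num_samples * 20, sd = sd_x), ncol = 20)
  colnames(dat) <- sprintf("predictor_%02d", 1:20)
  tibble::as_tibble(dat) %>%
    mutate(
      outcome =
        predictor_01 + sin(predictor_02) + log(abs(predictor_03)) +
        predictor_04^2 + predictor_05 * predictor_06 +
        ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) +
        ifelse(predictor_10 > 0, 1, 0) +
        predictor_11 * ifelse(predictor_11 > 0, 1, 0) +
        sqrt(abs(predictor_12)) + cos(predictor_13) +
        2 * predictor_14 + abs(predictor_15) +
        ifelse(predictor_16 < -1, 1, 0) +
        predictor_17 * ifelse(predictor_17 < -1, 1, 0) -
        2 * predictor_18 - predictor_19 * predictor_20 +
        rnorm(num_samples, sd = sd_noise)
    ) %>%
    relocate(outcome)
}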

How many folds is too many?

folds <- vfold_cv(sim_df, strata = outcome)
folds
#> #  10-fold cross-validation using stratification 
#> # A tibble: 10 × 2
#>    splits            id    
#>    <list>            <chr> 
#>  1 <split [900/100]> Fold01
#>  2 <split [900/100]> Fold02
#>  3 <split [900/100]> Fold03
#>  4 <split [900/100]> Fold04
#>  5 <split [900/100]> Fold05
#>  6 <split [900/100]> Fold06
#>  7 <split [900/100]> Fold07
#>  8 <split [900/100]> Fold08
#>  9 <split [900/100]> Fold09
#> 10 <split [900/100]> Fold10
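
Since outcome is numeric, strata = outcome first bins it (into quartiles by default in rsample) so every fold sees a similar spread of outcomes. Roughly, as a sketch:

# Approximately what stratification does with a numeric outcome;
# rsample::make_strata handles edge cases this simple version ignores
sim_df %>%
  mutate(outcome_bin = cut(
    outcome,
    breaks = quantile(outcome, probs = seq(0, 1, by = 0.25)),
    include.lowest = TRUE
  )) %>%
  count(outcome_bin)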

How many folds is too many?

doParallel::registerDoParallel()

workflow(
  outcome ~ ., 
  rand_forest(mode = "regression", trees = 1e3)
) %>%
  fit_resamples(folds) %>%
  collect_metrics()
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   15.8      10  0.573  Preprocessor1_Model1
#> 2 rsq     standard    0.554    10  0.0140 Preprocessor1_Model1
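
That std_err column is the standard deviation of the ten per-fold estimates divided by sqrt(10). To see where it comes from, save the resampling result (rf_res is a hypothetical name, not in the original code) and look at the unsummarized metrics:

rf_res <-
  workflow(
    outcome ~ .,
    rand_forest(mode = "regression", trees = 1e3)
  ) %>%
  fit_resamples(folds)

# per-fold estimates; sd(.estimate) / sqrt(n()) reproduces std_err above
rf_res %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "rmse") %>%
  summarise(mean = mean(.estimate), std_err = sd(.estimate) / sqrt(n()))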

How many folds is too many?

collect_simulated_metrics <- function(num_samples, v) {
  sim_df <- sim_regression(num_samples = num_samples)
  folds <- vfold_cv(sim_df, v = v, strata = outcome)
  workflow(
    outcome ~ ., 
    rand_forest(mode = "regression", trees = 1e3)
  ) %>%
    fit_resamples(folds) %>%
    collect_metrics()
}

collect_simulated_metrics(num_samples = 1e3, v = 3)
#> # A tibble: 2 × 6
#>   .metric .estimator   mean     n std_err .config             
#>   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   15.8       3 0.594   Preprocessor1_Model1
#> 2 rsq     standard    0.569     3 0.00488 Preprocessor1_Model1
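
Notice the tradeoff that v controls: with v = 3, each model trains on about 667 rows and is assessed on about 333, compared to 900 and 100 with the default v = 10. Fewer, larger assessment sets change both the bias and the variance of the resampling estimate, which is exactly what the next simulation measures.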

How many folds is too many?

metrics_sim <-
  tibble(v = rep(seq(4, 24, by = 2), 100)) %>%
  mutate(metrics = map(v, collect_simulated_metrics, num_samples = 1e3)) %>%
  unnest(metrics)

metrics_sim
#> # A tibble: 2,200 × 7
#>        v .metric .estimator   mean     n std_err .config             
#>    <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
#>  1     4 rmse    standard   15.3       4  1.25   Preprocessor1_Model1
#>  2     4 rsq     standard    0.530     4  0.0354 Preprocessor1_Model1
#>  3     6 rmse    standard   15.0       6  0.489  Preprocessor1_Model1
#>  4     6 rsq     standard    0.518     6  0.0236 Preprocessor1_Model1
#>  5     8 rmse    standard   15.4       8  0.462  Preprocessor1_Model1
#>  6     8 rsq     standard    0.536     8  0.0242 Preprocessor1_Model1
#>  7    10 rmse    standard   15.2      10  0.420  Preprocessor1_Model1
#>  8    10 rsq     standard    0.549    10  0.0202 Preprocessor1_Model1
#>  9    12 rmse    standard   15.1      12  0.561  Preprocessor1_Model1
#> 10    12 rsq     standard    0.548    12  0.0232 Preprocessor1_Model1
#> # … with 2,190 more rows

How many folds is too many?

metrics_sim %>%
  filter(.metric == "rmse") %>%
  group_by(v) %>%
  summarise(variance = median(std_err ^ 2 / n)) %>%
  ggplot(aes(v, variance)) +
  geom_point()

How many folds is too many?

How does bias change with folds?

Run a simulation 💥

What if you have more or less data?

Run a simulation 💃

What if you are going to use a different model?

Run a simulation 🕺

What if you want to do repeated CV?

Run a simulation 👯

What if you want to use the bootstrap?

Run a simulation 💁‍♀️
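
All of these variations slot into the same pattern. For example, here is a sketch of the helper adapted for repeated CV (the repeats argument to vfold_cv()) and for the bootstrap (bootstraps()); these function names are illustrative, not from the original talk.

# Same pattern, different resampling schemes (a sketch)
collect_repeated_metrics <- function(num_samples, v, repeats) {
  sim_df <- sim_regression(num_samples = num_samples)
  folds <- vfold_cv(sim_df, v = v, repeats = repeats, strata = outcome)
  workflow(
    outcome ~ .,
    rand_forest(mode = "regression", trees = 1e3)
  ) %>%
    fit_resamples(folds) %>%
    collect_metrics()
}

collect_bootstrap_metrics <- function(num_samples, times) {
  sim_df <- sim_regression(num_samples = num_samples)
  boots <- bootstraps(sim_df, times = times, strata = outcome)
  workflow(
    outcome ~ .,
    rand_forest(mode = "regression", trees = 1e3)
  ) %>%
    fit_resamples(boots) %>%
    collect_metrics()
}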

How many observations do we need?

predictor_01 + predictor_02

How many observations do we need?

predictor_01 + predictor_02 + predictor_01 * predictor_02

How many observations do we need?

predictor_01 + predictor_02 + effect_size * predictor_01 * predictor_02

How many observations do we need?

simulate_interaction <- function(num_samples, effect_size) {
  dat <- matrix(rnorm(num_samples * 2, sd = 2), ncol = 2)
  colnames(dat) <- paste0("predictor_0", 1:2)
  tibble::as_tibble(dat) %>%
    mutate(
      outcome = predictor_01 + predictor_02 + effect_size * predictor_01 * predictor_02,
      outcome = outcome + rnorm(num_samples, sd = 2)
    ) %>%
    relocate(outcome)
}

simulate_interaction(100, 0.1)
#> # A tibble: 100 × 3
#>    outcome predictor_01 predictor_02
#>      <dbl>        <dbl>        <dbl>
#>  1   4.99         0.945        3.99 
#>  2  -7.40        -2.58        -2.19 
#>  3   3.17         3.58         2.63 
#>  4  -1.50        -2.51        -2.02 
#>  5   0.266        1.48         0.815
#>  6  -1.08         0.675       -1.03 
#>  7  -7.09        -1.01        -2.71 
#>  8   3.82         2.08         1.81 
#>  9  -1.16         0.396       -1.42 
#> 10   2.16         0.573        1.24 
#> # … with 90 more rows

How many observations do we need?

df_sim <- simulate_interaction(100, 0.1)
lm_fitted <- lm(outcome ~ predictor_01 * predictor_02, data = df_sim)
tidy(lm_fitted)
#> # A tibble: 4 × 5
#>   term                      estimate std.error statistic  p.value
#>   <chr>                        <dbl>     <dbl>     <dbl>    <dbl>
#> 1 (Intercept)                0.401      0.203     1.97   5.13e- 2
#> 2 predictor_01               0.902      0.103     8.76   6.71e-14
#> 3 predictor_02               1.18       0.104    11.4    1.34e-19
#> 4 predictor_01:predictor_02  0.00377    0.0495    0.0760 9.40e- 1
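
With only 100 observations and effect_size = 0.1, this particular draw fails to detect the interaction (p ≈ 0.94). One fit tells us little on its own, though; to estimate power, we need to repeat the experiment many times.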

How many observations do we need?

simulate_power <- function(num_samples, effect_size) {
  df_sim <- simulate_interaction(num_samples, effect_size)
  lm_fitted <- lm(outcome ~ predictor_01 * predictor_02, data = df_sim)
  tidy(lm_fitted) %>% 
    filter(term == "predictor_01:predictor_02") %>% 
    summarize(sig = p.value < 0.05) %>% 
    pull(sig)
}

rerun(100, simulate_power(100, 0.1)) %>%
  flatten_lgl() %>%
  mean()
#> [1] 0.41
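
Two asides: with 100 reruns, the Monte Carlo standard error on that power estimate is roughly sqrt(0.41 × 0.59 / 100) ≈ 0.05, so treat it as approximate. Also, purrr has since deprecated rerun() and flatten_lgl(); an equivalent form is:

# same simulation without the deprecated helpers
map_lgl(1:100, ~ simulate_power(100, 0.1)) %>%
  mean()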

How many observations do we need?

power_sim_results <-
  crossing(
    num_samples = seq(100, 900, by = 200),
    effect_size = seq(-0.08, 0.08, by = 0.01)
  ) %>%
  mutate(power = map2_dbl(
    num_samples, effect_size,
    ~ rerun(1e3, simulate_power(.x, .y)) %>%
      flatten_lgl() %>%
      mean())
  )
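
This grid fits 85 parameter combinations × 1,000 reruns = 85,000 small linear models, so it takes a little while to run.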

power_sim_results
#> # A tibble: 85 × 3
#>    num_samples effect_size power
#>          <dbl>       <dbl> <dbl>
#>  1         100     -0.08   0.329
#>  2         100     -0.07   0.256
#>  3         100     -0.06   0.206
#>  4         100     -0.05   0.169
#>  5         100     -0.04   0.127
#>  6         100     -0.03   0.098
#>  7         100     -0.02   0.072
#>  8         100     -0.0100 0.061
#>  9         100      0      0.046
#> 10         100      0.0100 0.062
#> # … with 75 more rows

How many observations do we need?

power_sim_results %>%
  ggplot(aes(effect_size, power, group = num_samples, color = num_samples)) +
  geom_hline(yintercept = 0.8, linetype = "dashed", alpha = 0.5) +
  geom_line(linewidth = 1.5, alpha = 0.7) +
  scale_color_viridis_c() +
  scale_y_continuous(labels = scales::percent)

How many observations do we need?

How important is this relationship?

The nullabor package provides tools for graphical inference.

library(nullabor)

simulate_viz_inference <- function(num_samples) {
  dat <- matrix(rnorm(num_samples * 2), ncol = 2)
  colnames(dat) <- paste0("predictor_0", 1:2)
  tibble::as_tibble(dat) %>%
    mutate(
      outcome = predictor_01 + log(abs(predictor_02)),
      outcome = outcome + rnorm(num_samples)
    ) %>%
    relocate(outcome)
}

simulate_viz_inference(100)
#> # A tibble: 100 × 3
#>    outcome predictor_01 predictor_02
#>      <dbl>        <dbl>        <dbl>
#>  1  -0.339       -0.236      -0.742 
#>  2  -0.164        0.128       0.773 
#>  3  -2.67        -0.689       0.253 
#>  4   1.29         1.43       -0.607 
#>  5   2.09         1.59       -1.52  
#>  6  -2.79        -0.248       0.0431
#>  7  -1.92        -0.326      -0.472 
#>  8   0.675        0.184      -0.692 
#>  9  -3.41        -2.22       -0.267 
#> 10   1.17         1.42       -1.23  
#> # … with 90 more rows

How important is this relationship?

simulate_viz_inference(200) %>%
  ggplot(aes(predictor_01, outcome)) +
  geom_point(alpha = 0.4) 

How important is this relationship?

simulate_viz_inference(200) %>%
  ggplot(aes(predictor_02, outcome)) +
  geom_point(alpha = 0.4) 
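
Unlike predictor_01, the log(abs()) relationship for predictor_02 is hard to see in a scatterplot, which is exactly the kind of question graphical inference can help answer.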

How important is this relationship?

permuted <- lineup(null_permute("outcome"), simulate_viz_inference(200))

permuted %>%
  ggplot(aes(predictor_01, outcome)) +
  geom_point(alpha = 0.2) + 
  facet_wrap(~ .sample)
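
The lineup() function hides the real data among permuted panels (20 panels total by default, so 19 nulls) and prints an encrypted decrypt() call to the console. If you can pick out the real panel before running decrypt() to check, the relationship is strong enough to distinguish from noise.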

How important is this relationship?

permuted <- lineup(null_permute("outcome"), simulate_viz_inference(200))

permuted %>%
  ggplot(aes(predictor_02, outcome)) +
  geom_point(alpha = 0.2) + 
  facet_wrap(~ .sample)

How important is this relationship?

Simulation is a powerful tool to…

  • make assumptions concrete
  • get on the same page with colleagues about tradeoffs
  • ultimately make better decisions for our everyday data science tasks

@juliasilge

youtube.com/juliasilge

juliasilge.com