# Load the tidymodels meta-package (dplyr, rsample, parsnip, tune, etc.)
library(tidymodels)
# Simulate 1,000 rows of regression data: a numeric `outcome` plus 20 numeric
# predictors (sim_regression() comes from modeldata, attached by tidymodels)
sim_df <- sim_regression(num_samples = 1e3)
glimpse(sim_df)
#> Rows: 1,000
#> Columns: 21
#> $ outcome <dbl> -4.928525, 31.243149, -32.990575, 19.297869, 33.558983, 2…
#> $ predictor_01 <dbl> 1.45775279, 0.82638847, -4.92052194, 6.46431441, -0.82217…
#> $ predictor_02 <dbl> 4.116466132, 2.610214425, 0.675752837, 2.570178449, 2.142…
#> $ predictor_03 <dbl> 5.4458548, -3.1014774, 1.1625076, 3.4390711, 3.3372852, -…
#> $ predictor_04 <dbl> -1.01284231, 0.81065762, -3.13098977, -1.75485152, 3.2372…
#> $ predictor_05 <dbl> 1.54527981, 0.74044008, -3.97539047, -4.60811097, 1.07821…
#> $ predictor_06 <dbl> -6.34279526, 4.20078207, 2.14755389, -1.79727132, 3.49558…
#> $ predictor_07 <dbl> 0.9105746, -1.0624028, 4.1461805, 2.6135963, 1.8528900, -…
#> $ predictor_08 <dbl> 3.6025893, -0.4133759, -5.2788528, -3.4689416, -6.0568117…
#> $ predictor_09 <dbl> -3.8147997, -2.3600506, -2.3775382, -2.3569280, 2.2462209…
#> $ predictor_10 <dbl> -2.9292750, -4.6403150, -1.5438041, 1.1909429, -1.4302187…
#> $ predictor_11 <dbl> 1.954786777, -0.458930773, 0.368848197, -5.505276854, 0.8…
#> $ predictor_12 <dbl> 0.56734762, 3.55299901, 4.06405586, -3.82337471, -0.79193…
#> $ predictor_13 <dbl> 2.59202997, 0.90878202, -0.37713684, -3.44231442, 0.47654…
#> $ predictor_14 <dbl> 0.20347530, -1.33296884, -2.83929165, -2.02793712, -2.090…
#> $ predictor_15 <dbl> 1.6814880, 4.1218454, -1.6344148, 0.8939713, -4.0727849, …
#> $ predictor_16 <dbl> -2.23030461, 0.52884386, 5.80379562, -3.97012420, -1.5965…
#> $ predictor_17 <dbl> 1.79706540, 2.61977702, 1.75621990, 2.85542716, 2.1002845…
#> $ predictor_18 <dbl> 1.5624638, -1.3170392, 0.3488918, 0.3782973, -3.0179789, …
#> $ predictor_19 <dbl> -2.89790636, -2.31515186, 4.77557443, -4.31079590, 3.7908…
#> $ predictor_20 <dbl> -1.44944642, 6.12993329, 5.39468018, 0.43952610, -2.98379…
# True data-generating formula for `outcome` (quoted from the sim_regression()
# documentation): nonlinear terms, interactions, and indicator effects.
# Shown for reference only -- not meant to be evaluated on its own.
predictor_01 + sin(predictor_02) + log(abs(predictor_03)) +
predictor_04^2 + predictor_05 * predictor_06 +
ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) +
ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) +
sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) +
ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) -
2 * predictor_18 - predictor_19 * predictor_20
The sim_regression() function uses 20 independent Gaussian random predictors, combined via the formula above to generate the outcome.
# Cross-validation folds (v = 10 by default), stratified on the numeric
# outcome so each fold sees a similar outcome distribution
folds <- vfold_cv(sim_df, strata = outcome)
folds
#> # 10-fold cross-validation using stratification
#> # A tibble: 10 × 2
#> splits id
#> <list> <chr>
#> 1 <split [900/100]> Fold01
#> 2 <split [900/100]> Fold02
#> 3 <split [900/100]> Fold03
#> 4 <split [900/100]> Fold04
#> 5 <split [900/100]> Fold05
#> 6 <split [900/100]> Fold06
#> 7 <split [900/100]> Fold07
#> 8 <split [900/100]> Fold08
#> 9 <split [900/100]> Fold09
#> 10 <split [900/100]> Fold10
# Register a parallel backend so fit_resamples() can process folds in parallel
doParallel::registerDoParallel()
# Pair a formula preprocessor (outcome ~ all predictors) with a 1000-tree
# random forest in regression mode, fit to each resample, and summarize the
# resampled performance metrics (RMSE and R-squared)
workflow(
outcome ~ .,
rand_forest(mode = "regression", trees = 1e3)
) %>%
fit_resamples(folds) %>%
collect_metrics()
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 rmse standard 15.8 10 0.573 Preprocessor1_Model1
#> 2 rsq standard 0.554 10 0.0140 Preprocessor1_Model1
#' Simulate a regression data set and compute resampled model metrics.
#'
#' @param num_samples Number of rows to simulate with sim_regression().
#' @param v Number of cross-validation folds.
#' @return A tibble of averaged resampling metrics (one row per metric),
#'   as returned by collect_metrics().
collect_simulated_metrics <- function(num_samples, v) {
  dat <- sim_regression(num_samples = num_samples)
  resamples <- vfold_cv(dat, v = v, strata = outcome)
  rf_workflow <- workflow(
    outcome ~ .,
    rand_forest(mode = "regression", trees = 1e3)
  )
  rf_workflow %>%
    fit_resamples(resamples) %>%
    collect_metrics()
}
collect_simulated_metrics(num_samples = 1e3, v = 3)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 rmse standard 15.8 3 0.594 Preprocessor1_Model1
#> 2 rsq standard 0.569 3 0.00488 Preprocessor1_Model1
# For each fold count v in 4, 6, ..., 24 (the whole sequence repeated 100
# times), simulate a fresh data set and compute resampled metrics, then
# unnest so each metric estimate becomes its own row (11 * 100 * 2 = 2,200)
metrics_sim <-
tibble(v = rep(seq(4, 24, by = 2), 100)) %>%
mutate(metrics = map(v, collect_simulated_metrics, num_samples = 1e3)) %>%
unnest(metrics)
metrics_sim
#> # A tibble: 2,200 × 7
#> v .metric .estimator mean n std_err .config
#> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 4 rmse standard 15.3 4 1.25 Preprocessor1_Model1
#> 2 4 rsq standard 0.530 4 0.0354 Preprocessor1_Model1
#> 3 6 rmse standard 15.0 6 0.489 Preprocessor1_Model1
#> 4 6 rsq standard 0.518 6 0.0236 Preprocessor1_Model1
#> 5 8 rmse standard 15.4 8 0.462 Preprocessor1_Model1
#> 6 8 rsq standard 0.536 8 0.0242 Preprocessor1_Model1
#> 7 10 rmse standard 15.2 10 0.420 Preprocessor1_Model1
#> 8 10 rsq standard 0.549 10 0.0202 Preprocessor1_Model1
#> 9 12 rmse standard 15.1 12 0.561 Preprocessor1_Model1
#> 10 12 rsq standard 0.548 12 0.0232 Preprocessor1_Model1
#> # … with 2,190 more rows
Run a simulation 💥
Run a simulation 💃
Run a simulation 🕺
Run a simulation 👯
Run a simulation 💁‍♀️
#' Simulate two Gaussian predictors and an outcome with an interaction term.
#'
#' @param num_samples Number of rows to simulate.
#' @param effect_size Coefficient on the predictor_01 * predictor_02
#'   interaction in the true outcome model.
#' @return A tibble with columns outcome, predictor_01, predictor_02.
simulate_interaction <- function(num_samples, effect_size) {
  # Draw both predictors at once (sd = 2), one column per predictor
  pred_mat <- matrix(rnorm(num_samples * 2, sd = 2), ncol = 2)
  colnames(pred_mat) <- paste0("predictor_0", 1:2)
  preds <- tibble::as_tibble(pred_mat)
  preds %>%
    mutate(
      # Main effects + scaled interaction, plus Gaussian noise (sd = 2)
      outcome =
        predictor_01 + predictor_02 +
        effect_size * predictor_01 * predictor_02 +
        rnorm(num_samples, sd = 2)
    ) %>%
    relocate(outcome)
}
simulate_interaction(100, 0.1)
#> # A tibble: 100 × 3
#> outcome predictor_01 predictor_02
#> <dbl> <dbl> <dbl>
#> 1 4.99 0.945 3.99
#> 2 -7.40 -2.58 -2.19
#> 3 3.17 3.58 2.63
#> 4 -1.50 -2.51 -2.02
#> 5 0.266 1.48 0.815
#> 6 -1.08 0.675 -1.03
#> 7 -7.09 -1.01 -2.71
#> 8 3.82 2.08 1.81
#> 9 -1.16 0.396 -1.42
#> 10 2.16 0.573 1.24
#> # … with 90 more rows
# Fit a linear model with both main effects and their interaction, then tidy
# the coefficient table (the interaction row is the quantity of interest)
df_sim <- simulate_interaction(100, 0.1)
lm_fitted <- lm(outcome ~ predictor_01 * predictor_02, data = df_sim)
tidy(lm_fitted)
#> # A tibble: 4 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.401 0.203 1.97 5.13e- 2
#> 2 predictor_01 0.902 0.103 8.76 6.71e-14
#> 3 predictor_02 1.18 0.104 11.4 1.34e-19
#> 4 predictor_01:predictor_02 0.00377 0.0495 0.0760 9.40e- 1
#' Run one simulated experiment and report whether the interaction term is
#' statistically significant at the 0.05 level.
#'
#' @param num_samples Number of rows in the simulated data set.
#' @param effect_size True interaction effect size passed to
#'   simulate_interaction().
#' @return A single logical: TRUE if the interaction p-value is below 0.05.
simulate_power <- function(num_samples, effect_size) {
  sim_data <- simulate_interaction(num_samples, effect_size)
  fitted_model <- lm(outcome ~ predictor_01 * predictor_02, data = sim_data)
  interaction_p <-
    tidy(fitted_model) %>%
    filter(term == "predictor_01:predictor_02") %>%
    pull(p.value)
  interaction_p < 0.05
}
# Estimate power by repeating the experiment 100 times and taking the
# proportion of significant results. rerun() and flatten_lgl() were
# deprecated in purrr 1.0.0; map_lgl() iterates and returns a logical
# vector in a single step.
map_lgl(seq_len(100), \(i) simulate_power(100, 0.1)) %>%
  mean()
#> [1] 0.41
# Estimate power across a grid of sample sizes and effect sizes (1,000
# replicate experiments per grid cell). rerun()/flatten_lgl() are deprecated
# in purrr >= 1.0.0; use map_lgl() with named anonymous-function arguments so
# the outer map2 inputs cannot collide with an inner formula's .x/.y.
power_sim_results <-
  crossing(
    num_samples = seq(100, 900, by = 200),
    effect_size = seq(-0.08, 0.08, by = 0.01)
  ) %>%
  mutate(power = map2_dbl(
    num_samples, effect_size,
    \(n, es) mean(map_lgl(seq_len(1e3), \(i) simulate_power(n, es)))
  ))
power_sim_results
#> # A tibble: 85 × 3
#> num_samples effect_size power
#> <dbl> <dbl> <dbl>
#> 1 100 -0.08 0.329
#> 2 100 -0.07 0.256
#> 3 100 -0.06 0.206
#> 4 100 -0.05 0.169
#> 5 100 -0.04 0.127
#> 6 100 -0.03 0.098
#> 7 100 -0.02 0.072
#> 8 100 -0.0100 0.061
#> 9 100 0 0.046
#> 10 100 0.0100 0.062
#> # … with 75 more rows
The nullabor package provides tools for graphical inference.
library(nullabor)
#' Simulate data where the outcome depends nonlinearly on the second
#' predictor (log of its absolute value) for graphical-inference examples.
#'
#' @param num_samples Number of rows to simulate.
#' @return A tibble with columns outcome, predictor_01, predictor_02.
simulate_viz_inference <- function(num_samples) {
  # Two standard-normal predictors, one per column
  pred_mat <- matrix(rnorm(num_samples * 2), ncol = 2)
  colnames(pred_mat) <- paste0("predictor_0", 1:2)
  pred_mat %>%
    tibble::as_tibble() %>%
    mutate(
      # Linear in predictor_01, log-abs in predictor_02, plus standard noise
      outcome =
        predictor_01 + log(abs(predictor_02)) + rnorm(num_samples)
    ) %>%
    relocate(outcome)
}
simulate_viz_inference(100)
#> # A tibble: 100 × 3
#> outcome predictor_01 predictor_02
#> <dbl> <dbl> <dbl>
#> 1 -0.339 -0.236 -0.742
#> 2 -0.164 0.128 0.773
#> 3 -2.67 -0.689 0.253
#> 4 1.29 1.43 -0.607
#> 5 2.09 1.59 -1.52
#> 6 -2.79 -0.248 0.0431
#> 7 -1.92 -0.326 -0.472
#> 8 0.675 0.184 -0.692
#> 9 -3.41 -2.22 -0.267
#> 10 1.17 1.42 -1.23
#> # … with 90 more rows