Machine learning with tidymodels - 5

Build better predictors 🛠️

Some models require predictors with certain characteristics or a certain format
Some datasets are better modeled when one or more predictors are transformed

Build better predictors 🛠️

Statistical parameters for recipe steps can be estimated from an initial data set and then applied to other data sets
The resulting features can be used as inputs for statistical or machine learning models

Abalone data spending

library(tidymodels)
library(tidyverse)

# Read the abalone data and turn every character column into a factor.
# `across(where(...))` replaces the superseded `mutate_if()`.
abalone <- read_csv("abalone.csv") %>%
  mutate(across(where(is.character), as.factor))

# Put 80% of the rows in the training set, stratifying on `rings`.
set.seed(123)
ring_split <- initial_split(abalone, prop = 0.8, strata = rings)
ring_train <- training(ring_split)
ring_test <- testing(ring_split)

# 5-fold cross-validation of the training set, also stratified on `rings`.
set.seed(234)
ring_folds <- vfold_cv(ring_train, v = 5, strata = rings)
ring_folds
#> #  5-fold cross-validation using stratification 
#> # A tibble: 5 × 2
#>   splits             id   
#>   <list>             <chr>
#> 1 <split [2670/670]> Fold1
#> 2 <split [2672/668]> Fold2
#> 3 <split [2672/668]> Fold3
#> 4 <split [2673/667]> Fold4
#> 5 <split [2673/667]> Fold5

A first recipe

# The formula makes `rings` the outcome and every other column a predictor.
ring_rec <- recipe(rings ~ ., data = ring_train)

The recipe() function assigns columns to roles of “outcome” or “predictor” using the formula

A first recipe

# List each variable in the recipe with its type, role, and source.
ring_rec %>% summary()
#> # A tibble: 9 × 4
#>   variable       type    role      source  
#>   <chr>          <chr>   <chr>     <chr>   
#> 1 sex            nominal predictor original
#> 2 length         numeric predictor original
#> 3 diameter       numeric predictor original
#> 4 height         numeric predictor original
#> 5 whole_weight   numeric predictor original
#> 6 shucked_weight numeric predictor original
#> 7 viscera_weight numeric predictor original
#> 8 shell_weight   numeric predictor original
#> 9 rings          numeric outcome   original

A first recipe

# The formula makes `rings` the outcome and every other column a predictor.
ring_rec <- recipe(rings ~ ., data = ring_train)

Create indicator variables

# Start from the formula-based recipe.
ring_rec <- recipe(rings ~ ., data = ring_train)
# Replace nominal predictors with numeric indicator columns.
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())

rings	length	diameter	height	whole_weight	shucked_weight	viscera_weight	shell_weight	sex_infant	sex_male
7	0.350	0.265	0.090	0.226	0.100	0.048	0.070	0	1
7	0.330	0.255	0.080	0.205	0.090	0.040	0.055	1	0
7	0.355	0.280	0.085	0.290	0.095	0.040	0.115	1	0
7	0.365	0.295	0.080	0.256	0.097	0.043	0.100	0	1
8	0.465	0.355	0.105	0.480	0.227	0.124	0.125	0	1
8	0.450	0.355	0.105	0.522	0.237	0.116	0.145	0	0
5	0.240	0.175	0.045	0.070	0.032	0.024	0.020	1	0
5	0.205	0.150	0.055	0.042	0.025	0.015	0.012	1	0
4	0.210	0.150	0.050	0.042	0.018	0.013	0.015	1	0
7	0.390	0.295	0.095	0.203	0.088	0.045	0.075	1	0

Normalization

# Start from the formula-based recipe.
ring_rec <- recipe(rings ~ ., data = ring_train)
# Replace nominal predictors with numeric indicator columns.
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())
# Normalize all numeric predictors; the indicator columns created above are
# numeric by this point, so they are normalized as well.
ring_rec <- step_normalize(ring_rec, all_numeric_predictors())

rings	length	diameter	height	whole_weight	shucked_weight	viscera_weight	shell_weight	sex_infant	sex_male
7	-1.453	-1.444	-1.284	-1.228	-1.166	-1.203	-1.215	-0.683	1.307
7	-1.620	-1.545	-1.545	-1.270	-1.211	-1.286	-1.323	1.463	-0.765
7	-1.411	-1.292	-1.414	-1.095	-1.187	-1.286	-0.888	1.463	-0.765
7	-1.327	-1.140	-1.545	-1.166	-1.178	-1.254	-0.997	-0.683	1.307
8	-0.489	-0.532	-0.891	-0.708	-0.592	-0.511	-0.816	-0.683	1.307
8	-0.615	-0.532	-0.891	-0.619	-0.547	-0.580	-0.671	-0.683	-0.765
5	-2.374	-2.356	-2.461	-1.546	-1.473	-1.433	-1.577	1.463	-0.765
5	-2.668	-2.610	-2.200	-1.604	-1.500	-1.511	-1.635	1.463	-0.765
4	-2.626	-2.610	-2.330	-1.604	-1.536	-1.533	-1.613	1.463	-0.765
7	-1.118	-1.140	-1.153	-1.274	-1.220	-1.235	-1.178	1.463	-0.765

Reduce correlation

# Start from the formula-based recipe.
ring_rec <- recipe(rings ~ ., data = ring_train)
# Replace nominal predictors with numeric indicator columns.
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())
# Normalize all numeric predictors.
ring_rec <- step_normalize(ring_rec, all_numeric_predictors())
# Filter out highly correlated numeric predictors, using 0.9 as the threshold.
ring_rec <- step_corr(ring_rec, all_numeric_predictors(), threshold = 0.9)

rings	height	shucked_weight	shell_weight	sex_infant	sex_male
7	-1.284	-1.166	-1.215	-0.683	1.307
7	-1.545	-1.211	-1.323	1.463	-0.765
7	-1.414	-1.187	-0.888	1.463	-0.765
7	-1.545	-1.178	-0.997	-0.683	1.307
8	-0.891	-0.592	-0.816	-0.683	1.307
8	-0.891	-0.547	-0.671	-0.683	-0.765
5	-2.461	-1.473	-1.577	1.463	-0.765
5	-2.200	-1.500	-1.635	1.463	-0.765
4	-2.330	-1.536	-1.613	1.463	-0.765
7	-1.153	-1.220	-1.178	1.463	-0.765

Dimensionality reduction

# Start from the formula-based recipe.
ring_rec <- recipe(rings ~ ., data = ring_train)
# Replace nominal predictors with numeric indicator columns.
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())
# Normalize all numeric predictors before extracting components.
ring_rec <- step_normalize(ring_rec, all_numeric_predictors())
# Replace the numeric predictors with principal component scores (PC1, PC2, ...).
ring_rec <- step_pca(ring_rec, all_numeric_predictors())

rings	PC1	PC2	PC3	PC4	PC5
7	-2.930	2.248	-0.254	0.297	-0.143
7	-4.017	-0.495	0.308	0.346	-0.117
7	-3.570	-0.608	0.352	0.191	-0.191
7	-2.786	2.207	-0.228	0.306	0.073
8	-1.309	1.838	-0.038	0.151	0.362
8	-1.541	0.133	-1.173	0.356	0.253
5	-5.254	-0.193	0.229	1.113	-0.299
5	-5.442	-0.142	0.193	1.078	-0.653
4	-5.486	-0.133	0.193	1.116	-0.584
7	-3.480	-0.625	0.318	-0.156	0.053

Build nonlinear features

# Start from the formula-based recipe.
ring_rec <- recipe(rings ~ ., data = ring_train)
# Replace nominal predictors with numeric indicator columns.
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())
# Normalize all numeric predictors.
ring_rec <- step_normalize(ring_rec, all_numeric_predictors())
# Expand `shucked_weight` into four natural spline basis columns
# (`shucked_weight_ns_1` ... `shucked_weight_ns_4`).
ring_rec <- step_ns(ring_rec, shucked_weight, deg_free = 4)

rings	length	diameter	height	whole_weight	viscera_weight	shell_weight	sex_infant	sex_male	shucked_weight_ns_1	shucked_weight_ns_2	shucked_weight_ns_3	shucked_weight_ns_4
7	-1.453	-1.444	-1.284	-1.228	-1.203	-1.215	-0.683	1.307	0.031	-0.181	0.392	-0.211
7	-1.620	-1.545	-1.545	-1.270	-1.286	-1.323	1.463	-0.765	0.023	-0.166	0.360	-0.194
7	-1.411	-1.292	-1.414	-1.095	-1.286	-0.888	1.463	-0.765	0.027	-0.174	0.378	-0.204
7	-1.327	-1.140	-1.545	-1.166	-1.254	-0.997	-0.683	1.307	0.029	-0.177	0.384	-0.207
8	-0.489	-0.532	-0.891	-0.708	-0.511	-0.816	-0.683	1.307	0.366	-0.206	0.450	-0.242
8	-0.615	-0.532	-0.891	-0.619	-0.580	-0.671	-0.683	-0.765	0.410	-0.194	0.425	-0.229
5	-2.374	-2.356	-2.461	-1.546	-1.433	-1.577	1.463	-0.765	0.001	-0.062	0.134	-0.072
5	-2.668	-2.610	-2.200	-1.604	-1.511	-1.635	1.463	-0.765	0.000	-0.050	0.108	-0.058
4	-2.626	-2.610	-2.330	-1.604	-1.533	-1.613	1.463	-0.765	0.000	-0.034	0.073	-0.040
7	-1.118	-1.140	-1.153	-1.274	-1.235	-1.178	1.463	-0.765	0.021	-0.163	0.354	-0.191

Your turn

Create a recipe() for the abalone data to:

create one-hot indicator variables
remove zero-variance variables

03:00

Minimal recipe

# Minimal preprocessing: indicator columns, then normalization.
ring_rec <- recipe(rings ~ ., data = ring_train)
ring_rec <- step_dummy(ring_rec, all_nominal_predictors())
ring_rec <- step_normalize(ring_rec, all_numeric_predictors())

Using a workflow

set.seed(3)

# Bundle the recipe and a linear regression specification into one workflow.
lm_wf <- workflow() %>%
  add_recipe(ring_rec) %>%
  add_model(linear_reg())

# Ask the resampling functions to keep the holdout predictions.
ctrl_abalone <- control_resamples(save_pred = TRUE)

# Fit the workflow on the cross-validation folds.
lm_res <- fit_resamples(
  lm_wf,
  resamples = ring_folds,
  control = ctrl_abalone
)

collect_metrics(lm_res)
#> # A tibble: 2 × 6
#>   .metric .estimator  mean     n std_err .config             
#>   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1 rmse    standard   2.20      5 0.0413  Preprocessor1_Model1
#> 2 rsq     standard   0.533     5 0.00833 Preprocessor1_Model1

Your turn

Use fit_resamples() to fit your workflow with a recipe.

Collect the predictions from the results.

05:00

Holdout predictions

# The predictions are available because the control object was created
# with `save_pred = TRUE`.
ring_lm_preds <- collect_predictions(lm_res)

# Show the first three predictions from each fold.
ring_lm_preds %>%
  group_by(id) %>%
  slice(1:3)
#> # A tibble: 15 × 5
#> # Groups:   id [5]
#>    id    .pred  .row rings .config             
#>    <chr> <dbl> <int> <dbl> <chr>               
#>  1 Fold1  7.86     1     7 Preprocessor1_Model1
#>  2 Fold1  8.17     3     7 Preprocessor1_Model1
#>  3 Fold1  7.42    10     7 Preprocessor1_Model1
#>  4 Fold2  9.97    11     7 Preprocessor1_Model1
#>  5 Fold2  8.27    13     7 Preprocessor1_Model1
#>  6 Fold2 10.7     14     8 Preprocessor1_Model1
#>  7 Fold3  8.67     6     8 Preprocessor1_Model1
#>  8 Fold3  5.06     7     5 Preprocessor1_Model1
#>  9 Fold3  6.33    12     6 Preprocessor1_Model1
#> 10 Fold4  8.57     4     7 Preprocessor1_Model1
#> 11 Fold4  8.23     5     8 Preprocessor1_Model1
#> 12 Fold4  5.16     8     5 Preprocessor1_Model1
#> 13 Fold5  6.65     2     7 Preprocessor1_Model1
#> 14 Fold5  5.69    17     4 Preprocessor1_Model1
#> 15 Fold5  5.94    33     7 Preprocessor1_Model1

Recipes are estimated

Preprocessing steps in a recipe use the training set to compute quantities

What kind of quantities are computed for preprocessing?

Levels of a factor
Whether a column has zero variance
Mean and standard deviation for normalization
How to map variables to principal components

When using a workflow, this estimation occurs with fit()

Fit different recipes

A workflow set can cross models and/or preprocessors:

set.seed(1)

# Cross four preprocessors (the base recipe plus three extensions of it)
# with a single linear regression model, then resample every workflow.
abalone_set_res <-
  workflow_set(
    preproc = list(
      indicators = ring_rec,
      decorr = ring_rec %>%
        step_corr(all_numeric_predictors(), threshold = 0.9),
      splines = ring_rec %>%
        step_ns(shucked_weight, deg_free = 4),
      pca = ring_rec %>%
        step_pca(all_numeric_predictors())
    ),
    models = list(lm = linear_reg())
  ) %>%
  # Apply `fit_resamples()` to each workflow, reusing the folds and the
  # control object defined earlier.
  workflow_map(
    fn = "fit_resamples",
    resamples = ring_folds,
    verbose = TRUE,
    control = ctrl_abalone
  )

Your turn

Create a workflow set with 2 or 3 recipes.

(Consider using recipes we’ve already created.)

Use workflow_map() to resample the workflow set.

08:00

Compare recipes

library(forcats)

# Keep only the RMSE rows from the resampling results.
rmse_by_recipe <- collect_metrics(abalone_set_res) %>%
  filter(.metric == "rmse")

# One crossbar per workflow, ordered by mean RMSE and spanning
# plus/minus one standard error.
ggplot(rmse_by_recipe, aes(x = mean, y = fct_reorder(wflow_id, mean))) +
  geom_crossbar(aes(xmin = mean - std_err, xmax = mean + std_err)) +
  labs(y = NULL, x = "RMSE (holdout sets)")

Compare recipes

More on using recipes

Find recipe steps at https://www.tidymodels.org/find/recipes/
You can skip some steps on new data
The order of recipe steps matters
What happens when a recipe goes wrong? 😱

What happens when a recipe goes wrong? 😱

We recommend that you use a workflow() to estimate and apply a recipe
There are two lower-level functions for handling a recipe on its own (for example, when debugging)

fit() ➡️ prep()

predict() ➡️ bake()

5 - Feature engineering

Build better predictors 🛠️

Build better predictors 🛠️

Abalone data spending

A first recipe

A first recipe

A first recipe

Create indicator variables

Normalization

Reduce correlation

Dimensionality reduction

Build nonlinear features

Your turn

Minimal recipe

Using a workflow

Your turn

Holdout predictions

Recipes are estimated

Fit different recipes

Your turn

Compare recipes

Compare recipes

More on using recipes

What happens when a recipe goes wrong? 😱

`fit()` ➡️ `prep()`

`predict()` ➡️ `bake()`

Your turn

5 - Feature engineering

Build better predictors 🛠️

Build better predictors 🛠️

Abalone data spending

A first recipe

A first recipe

A first recipe

Create indicator variables

Normalization

Reduce correlation

Dimensionality reduction

Build nonlinear features

Your turn

Minimal recipe

Using a workflow

Your turn

Holdout predictions

Recipes are estimated

Fit different recipes

Your turn

Compare recipes

Compare recipes

More on using recipes

What happens when a recipe goes wrong? 😱

fit() ➡️ prep()

predict() ➡️ bake()

Your turn

`fit()` ➡️ `prep()`

`predict()` ➡️ `bake()`