USING TIDY DATA PRINCIPLES
library(tidyverse)
cheeses <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-06-04/cheeses.csv') |>
  filter(!is.na(flavor))
glimpse(cheeses)
#> Rows: 1,089
#> Columns: 19
#> $ cheese <chr> "Aarewasser", "Abbaye de Belloc", "Abbaye de Citeaux",…
#> $ url <chr> "https://www.cheese.com/aarewasser/", "https://www.che…
#> $ milk <chr> "cow", "sheep", "cow", "cow", "cow", "cow", "cow", "sh…
#> $ country <chr> "Switzerland", "France", "France", "France", "France",…
#> $ region <chr> NA, "Pays Basque", "Burgundy", "Savoie", "province of …
#> $ family <chr> NA, NA, NA, NA, NA, NA, "Cheddar", NA, NA, NA, NA, "Fe…
#> $ type <chr> "semi-soft", "semi-hard, artisan", "semi-soft, artisan…
#> $ fat_content <chr> NA, NA, NA, NA, NA, "50%", NA, "45%", NA, NA, NA, NA, …
#> $ calcium_content <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
#> $ texture <chr> "buttery", "creamy, dense, firm", "creamy, dense, smoo…
#> $ rind <chr> "washed", "natural", "washed", "washed", "washed", "wa…
#> $ color <chr> "yellow", "yellow", "white", "white", "pale yellow", "…
#> $ flavor <chr> "sweet", "burnt caramel", "acidic, milky, smooth", "fr…
#> $ aroma <chr> "buttery", "lanoline", "barnyardy, earthy", "perfumed,…
#> $ vegetarian <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, …
#> $ vegan <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
#> $ synonyms <chr> NA, "Abbaye Notre-Dame de Belloc", NA, NA, NA, NA, "Ab…
#> $ alt_spellings <chr> NA, NA, NA, "Tamié, Trappiste de Tamie, Abbey of Tamie…
#> $ producers <chr> "Jumi", NA, NA, NA, "Abbaye Cistercienne NOTRE-DAME DE…
Cheese data from https://www.cheese.com/ via Tidy Tuesday
What is a typical way to represent this text data for modeling?
library(tidytext)
dtm <- cheeses |>
  mutate(id = row_number()) |>
  unnest_tokens(word, flavor) |>              # one row per flavor word per cheese
  anti_join(get_stopwords(), by = "word") |>  # drop stop words
  count(id, word) |>
  bind_tf_idf(word, id, n) |>                 # weight counts by tf-idf
  cast_dfm(id, word, tf_idf)                  # cast to a document-feature matrix
dtm
#> Document-feature matrix of: 1,089 documents, 46 features (94.37% sparse) and 0 docvars.
Tip
This representation is extremely sparse and high dimensional; for natural language data, the number of features (unique tokens) can be huge.
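We can check this directly: the object created by cast_dfm() is a quanteda dfm, and (assuming quanteda is installed) its sparsity() helper reports the proportion of zero-valued cells:
library(quanteda)
sparsity(dtm)   # proportion of cells that are zero (about 0.94 here)
dim(dtm)        # 1,089 documents by 46 features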
John Rupert Firth: “You shall know a word by the company it keeps.”
library(wordsalad)
flavor_embeddings <-
  cheeses |>
  mutate(flavor = str_remove_all(flavor, ",")) |>  # drop commas so only words remain
  pull(flavor) |>
  glove()                                          # learn GloVe word embeddings from the flavor text
flavor_embeddings
#> # A tibble: 40 × 11
#> tokens V1 V2 V3 V4 V5 V6 V7 V8
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 vegetal -0.510 -0.187 -0.520 0.365 -9.50e-1 0.501 -0.335 6.27e-2
#> 2 burnt -0.137 -0.0292 1.28 0.0815 -2.30e-1 -0.144 -0.172 1.49e-1
#> 3 buttersco… 0.420 -0.901 0.0999 -0.00618 -6.14e-1 0.0612 -0.421 2.47e-2
#> 4 yeasty -0.314 0.103 0.0435 0.928 -2.35e-1 0.117 -0.0847 2.80e-1
#> 5 pronounced 0.179 0.0916 0.419 -0.136 -1.21e-4 -0.0412 -0.838 -3.44e-4
#> 6 tart 0.0123 -0.421 -0.173 -0.204 -1.03e-1 0.0591 -0.251 -1.89e-1
#> 7 woody -0.171 0.438 -0.404 0.809 -5.81e-1 -0.452 0.460 5.52e-1
#> 8 meaty -0.133 1.34 -0.0374 0.295 -9.38e-1 -0.240 -0.336 -1.31e-1
#> 9 floral -0.0764 0.459 -0.453 -0.589 -9.20e-2 0.209 -0.634 -2.95e-2
#> 10 pungent -0.278 0.674 0.0385 0.642 -9.86e-2 0.540 -0.543 4.24e-1
#> # ℹ 30 more rows
#> # ℹ 2 more variables: V9 <dbl>, V10 <dbl>
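As a quick sketch (not part of the original pipeline), we could ask which flavor words sit closest to a given token in this embedding space by computing cosine similarities between the word vectors; this assumes "buttery" survived as one of the learned tokens:
token_mat <- as.matrix(flavor_embeddings[, -1])       # drop the tokens column
rownames(token_mat) <- flavor_embeddings$tokens
token_mat <- token_mat / sqrt(rowSums(token_mat^2))   # unit-length rows
sort(drop(token_mat %*% token_mat["buttery", ]), decreasing = TRUE) |> head()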
Let’s create an overall embedding for each cheese (using mean()):
tidy_cheeses <-
  cheeses |>
  mutate(cheese_id = row_number()) |>
  unnest_tokens(word, flavor) |>
  left_join(flavor_embeddings, by = c("word" = "tokens")) |>
  group_by(cheese_id, cheese, milk, country, type) |>
  summarize(across(V1:V10, ~ mean(.x, na.rm = TRUE)), .groups = "drop")
tidy_cheeses
#> # A tibble: 1,089 × 15
#> cheese_id cheese milk country type V1 V2 V3 V4 V5
#> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 Aarewass… cow Switze… semi… 0.336 -0.480 -0.534 -0.568 2.21
#> 2 2 Abbaye d… sheep France semi… 0.0531 -0.422 1.21 -0.269 0.152
#> 3 3 Abbaye d… cow France semi… -0.0880 -0.196 -0.683 -0.181 1.21
#> 4 4 Abbaye d… cow France soft… 0.420 -0.439 0.391 -0.631 1.10
#> 5 5 Abbaye d… cow France semi… -0.0247 -0.0794 -0.802 -0.0671 1.53
#> 6 6 Abbaye d… cow France semi… -0.130 -0.229 -0.787 -0.110 1.46
#> 7 7 Abbot’s … cow Englan… semi… 0.0237 -0.412 -0.564 -0.503 1.70
#> 8 8 Abertam sheep Czech … hard… 0.169 0.225 -0.355 -0.161 0.705
#> 9 9 Abondance cow France semi… 0.415 -0.538 0.0781 -0.550 1.37
#> 10 10 Acapella goat United… soft… -0.298 -0.896 -0.0998 0.0855 1.11
#> # ℹ 1,079 more rows
#> # ℹ 5 more variables: V6 <dbl>, V7 <dbl>, V8 <dbl>, V9 <dbl>, V10 <dbl>
embeddings_mat <-
  tidy_cheeses |>
  select(V1:V10) |>
  as.matrix()

row.names(embeddings_mat) <- cheeses$cheese

# Normalize each row to unit length, then multiply by the transpose to get
# the cosine similarity between every pair of cheese embeddings
embeddings_similarity <- embeddings_mat / sqrt(rowSums(embeddings_mat * embeddings_mat))
embeddings_similarity <- embeddings_similarity %*% t(embeddings_similarity)
dim(embeddings_similarity)
#> [1] 1089 1089
Tip
This contains the similarity scores for each cheese flavor compared to each other cheese flavor.
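For example, a single pairwise score can be looked up by cheese name (the pair here is just illustrative):
embeddings_similarity["Aarewasser", "Abondance"]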
Let’s say we are most interested in this particular cheese:
cheeses |>
  filter(cheese == "Manchego") |>
  select(cheese, country, flavor)
enframe(embeddings_similarity["Manchego", ], name = "cheese", value = "similarity") |>
  arrange(-similarity)
#> # A tibble: 1,089 × 2
#> cheese similarity
#> <chr> <dbl>
#> 1 Baskeriu 1
#> 2 Beemster Classic 1
#> 3 Butternut 1
#> 4 Coulommiers 1
#> 5 Loma Alta 1
#> 6 Manchego 1
#> 7 Prairie Tomme 1
#> 8 Pleasant Creek 0.987
#> 9 Ardrahan 0.981
#> 10 Moses Sleeper 0.979
#> # ℹ 1,079 more rows
cheeses |>
  filter(cheese %in% c("Beemster Classic", "Butternut", "Loma Alta")) |>
  select(cheese, country, flavor)
#> # A tibble: 3 × 3
#> cheese country flavor
#> <chr> <chr> <chr>
#> 1 Beemster Classic Netherlands buttery, nutty
#> 2 Butternut United States buttery, nutty
#> 3 Loma Alta United States buttery, nutty
cheeses |>
  filter(cheese %in% c("Bayley Hazen Blue", "Alpha Tolman", "Cuor di burrata")) |>
  select(cheese, country, flavor)
#> # A tibble: 3 × 3
#> cheese country flavor
#> <chr> <chr> <chr>
#> 1 Alpha Tolman United States buttery, caramel, fruity, full-flavored, nutty
#> 2 Bayley Hazen Blue United States buttery, grassy, licorice, nutty, tangy
#> 3 Cuor di burrata Italy buttery, milky, sweet
What about the least similar cheeses to Manchego?
enframe(embeddings_similarity["Manchego", ], name = "cheese", value = "similarity") |>
  arrange(similarity)
#> # A tibble: 1,089 × 2
#> cheese similarity
#> <chr> <dbl>
#> 1 Bossa -0.770
#> 2 Minger -0.732
#> 3 St James -0.716
#> 4 Caprano -0.685
#> 5 St Cera -0.685
#> 6 Little Qualicum Raclette -0.635
#> 7 Saint Nectaire -0.627
#> 8 Sosha -0.612
#> 9 Pecorino di Sogliano -0.546
#> 10 Pecorino di Talamello -0.546
#> # ℹ 1,079 more rows
cheeses |>
  filter(cheese %in% c("Bossa", "St Cera", "Minger")) |>
  select(cheese, country, flavor)
#> # A tibble: 3 × 3
#> cheese country flavor
#> <chr> <chr> <chr>
#> 1 Bossa United States floral, meaty
#> 2 Minger Scotland full-flavored, garlicky, meaty, pungent, strong
#> 3 St Cera England full-flavored, pronounced
Embeddings are trained or learned from a large corpus of text data
Human prejudice or bias in the corpus becomes imprinted into the embeddings
African American first names are associated with more unpleasant feelings than European American first names
Women’s first names are more associated with family and men’s first names are more associated with career
Terms associated with women are more associated with the arts and terms associated with men are more associated with science
Embeddings are trained or learned from a large corpus of text data
For example, consider the case of Wikipedia
Wikipedia both reflects social/historical biases and generates bias
Embeddings can be reprojected to mitigate a specific bias (such as gender bias) using specific sets of words; a minimal sketch follows below
Training data can be augmented with counterfactuals
Other researchers suggest that fairness corrections should occur at the point of decision or action, rather than in the embeddings themselves
Evidence indicates that debiasing still allows stereotypes to seep back in
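As a minimal sketch of the reprojection idea (not from these slides, and using a hypothetical general-purpose embedding matrix emb with words as row names), one way to remove the component of every word vector along a gender direction defined by a word pair:
# `emb`, "he", and "she" are hypothetical; emb is a word-by-dimension matrix
gender_dir <- emb["he", ] - emb["she", ]
gender_dir <- gender_dir / sqrt(sum(gender_dir^2))            # unit-length direction
emb_debiased <- emb - (emb %*% gender_dir) %*% t(gender_dir)  # project the direction out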
Slides created with Quarto