pairwise_similarity has an undocumented minimum
DataStrategist opened this issue · 0 comments
DataStrategist commented
A bit of a long-winded reprex, but:
library(gutenbergr)
#> Warning: package 'gutenbergr' was built under R version 3.5.3
library(tidytext)
library(tidyverse)
#> -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 3.1.0 v purrr 0.3.1
#> v tibble 2.0.1 v dplyr 0.8.0.1
#> v tidyr 0.8.3 v stringr 1.3.1
#> v readr 1.3.1 v forcats 0.3.0
#> -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
library(widyr)
#> Warning: package 'widyr' was built under R version 3.5.3
TI <- gutenberg_works(title == "Treasure Island") %>% pull(gutenberg_id) %>%
gutenberg_download(.) %>% unnest_tokens(., word, text) %>%
count(word, sort = TRUE) %>% mutate(source = "T.I.")
#> Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
#> Using mirror http://aleph.gutenberg.org
Wi <- gutenberg_works(title == "The Wonderful Wizard of Oz") %>% pull(gutenberg_id) %>%
gutenberg_download(.) %>% unnest_tokens(., word, text) %>%
count(word, sort = TRUE) %>% mutate(source = "Wiz")
Co <- gutenberg_works(title == "The United States Constitution") %>% pull(gutenberg_id) %>%
gutenberg_download(.) %>% unnest_tokens(., word, text) %>%
count(word, sort = TRUE) %>% mutate(source = "Con")
JFK <- gutenberg_works(title == "John F. Kennedy's Inaugural Address") %>% pull(gutenberg_id) %>%
gutenberg_download(.) %>% unnest_tokens(., word, text) %>%
count(word, sort = TRUE) %>% mutate(source = "JFK")
## Combine
df <- bind_rows(TI, Wi, Co, JFK)
## Do similarity
df %>%
bind_tf_idf(word, source, n) %>% arrange(desc(tf_idf)) %>%
pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE)
#> # A tibble: 6 x 3
#> item1 item2 similarity
#> <chr> <chr> <dbl>
#> 1 Wiz T.I. 0.349
#> 2 Con JFK 0.0513
#> 3 T.I. JFK 0.0483
#> 4 Con T.I. 0.0314
#> 5 Wiz JFK 0.0301
#> 6 Con Wiz 0.0155
## So far so good, but what if I wanted to see which is most likely to say "I love you"?
Love <- tibble(word = rep("I love you", 10), source = "TEST") %>% unnest_tokens(word, word) %>%
count(source, word, sort = TRUE)
## With four sources (plus TEST) it's possible:
bind_rows(Love, df) %>%
bind_tf_idf(word, source, n) %>% arrange(desc(tf_idf)) %>%
pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
filter(item1 == "TEST") %>% select(-item1)
#> # A tibble: 4 x 2
#> item2 similarity
#> <chr> <dbl>
#> 1 T.I. 0.0654
#> 2 Wiz 0.0526
#> 3 JFK 0.0267
#> 4 Con 0
## But with only two sources (plus TEST), it errors out:
df2 <- bind_rows(TI, Wi)
bind_rows(Love, df2) %>%
bind_tf_idf(word, source, n) %>% arrange(desc(tf_idf)) %>%
pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
filter(item1 == "TEST") %>% select(-item1)
#> Error in `colnames<-`(`*tmp*`, value = c("item1", "item2", "value")): attempt to set 'colnames' on an object with less than two dimensions
## How come?
The minimum number of items that pairwise_similarity needs should probably be documented, no?