NLMITC19: A repository from erictleung

Load libraries
Query data
General tweet prevalence over time
Most prolific tweeters?
Relationship between follower count and tweet popularity
Chatterplot of tweet words
Session information

Load libraries

library(tidyverse)
library(tidytext)
library(ggrepel)

if (!requireNamespace("rtweet", quietly = TRUE)) install.packages("rtweet")
library(rtweet)

Query data

Below is the code used to query Twitter for tweets tagged #NLMITC19 or #NLMIT19. I ran this at 2019-06-28 22:50.

# Search Twitter for either conference hashtag, leaving out retweets
rt <- search_tweets(
  q = "#NLMITC19 OR #NLMIT19",
  n = 1800,
  include_rts = FALSE
)

# Save the full search result, plus the status IDs on their own
saveRDS(rt, file = "nlmitc19_search.rds")
saveRDS(rt[["status_id"]], file = "nlmitc19_search-ids.rds")

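But instead of re-running the search, here I’ll load the saved search results if they exist locally; otherwise I’ll download the saved status IDs and look up the tweets from them.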

# Local file names for the saved status IDs and the saved full search result
ids_file <- "nlmitc19_search-ids.rds"
nlmitc19_file <- "nlmitc19_search.rds"


# Read in search directly if exists
if (file.exists(nlmitc19_file)) {
  rt <- readRDS(nlmitc19_file)
} else {
  # Download status IDs file.
  # An .rds file is binary, so it must be written in binary mode ("wb").
  # The URL ends in "?raw=true", so R cannot infer binary mode from the file
  # extension, and the default text mode can corrupt the file on Windows.
  download.file(
    "https://github.com/erictleung/NLMITC19/blob/master/data/nlmitc19_search-ids.rds?raw=true",
    ids_file,
    mode = "wb"
  )

  # Read status IDs from downloaded file
  ids <- readRDS(ids_file)


  # Lookup data associated with status ids
  rt <- rtweet::lookup_tweets(ids)
}

General tweet prevalence over time

Code modified from rstudioconf_tweets.

# Time series of tweet counts in 30-minute bins; the line drawn by ts_plot()
# itself is made transparent so only the layers added below are visible
ts_plot(rt, "30 minutes", color = "transparent") +

  # Loess-smoothed trend line
  geom_smooth(
    method = "loess",
    se = FALSE,
    span = 0.05,
    size = 2,
    color = "#0066aa"
  ) +

  # One filled circle per 30-minute bin
  geom_point(
    size = 5,
    shape = 21,
    fill = "#ADFF2F99",
    color = "#000000dd"
  ) +

  # Title, subtitle, and caption; axis titles are dropped
  labs(
    title = "Frequency of tweets about #NLMITC19 over time",
    subtitle = "Twitter status counts aggregated using half-hour intervals",
    caption = "\n\nSource: Data gathered via Twitter's standard `search/tweets` API using rtweet",
    x = NULL,
    y = NULL
  ) +

  # Minimal theme with larger, bolder title text
  theme_minimal(base_size = 15) +
  theme(
    axis.text = element_text(colour = "#222222"),
    plot.title = element_text(size = rel(1.7), face = "bold"),
    plot.subtitle = element_text(size = rel(1.3)),
    plot.caption = element_text(colour = "#444444")
  )

Makes sense considering there were two days of conference time.

Most prolific tweeters?

# Count tweets per user, then plot users ordered by their tweet total
group_by(rt, screen_name) %>%
  summarise(tweets = n()) %>%
  ggplot(mapping = aes(tweets, reorder(screen_name, tweets))) +
  geom_point() +

  # Title, axis labels, and data-source caption
  labs(
    title = "Top tweeters using\n#NLMITC19 or #NLMIT19",
    x = "Total number of tweets",
    y = "Twitter username",
    caption = "\n\nSource: Data gathered via Twitter's standard `search/tweets` API using rtweet"
  ) +

  # Minimal theme with larger, bolder title text
  theme_minimal(base_size = 15) +
  theme(
    axis.text = element_text(colour = "#222222"),
    plot.title = element_text(size = rel(1.7), face = "bold"),
    plot.subtitle = element_text(size = rel(1.3)),
    plot.caption = element_text(colour = "#444444")
  )

Relationship between follower count and tweet popularity

Do users with more followers have more popular tweets?

I take the average number of favorites across an individual’s tweets and normalize it by dividing by that individual’s total number of tweets.

# Scatter plot of each user's follower count against their average favorites
# per tweet, normalized by how many tweets they posted
rt %>%
  # Preprocess and count average favorites normalized by number of tweets
  group_by(screen_name) %>%
  mutate(avg_fav = mean(favorite_count)) %>%
  mutate(avg_norm_fav = avg_fav / n()) %>%
  ungroup() %>%

  # Keep the per-user columns and drop the duplicated rows
  select(screen_name, avg_fav, avg_norm_fav, followers_count) %>%
  distinct() %>%

  # Offset to not create infinite values when log transforming
  mutate(followers_count = followers_count + 0.001) %>%
  mutate(avg_norm_fav = avg_norm_fav + 0.001) %>%

  # Plot results
  ggplot(aes(x = followers_count, y = avg_norm_fav, label = screen_name)) +
  geom_text_repel() +
  geom_point() +

  # Use log-scale for x-axis and y-axis; the offsets above exist so that
  # zero values stay finite under this transform
  scale_x_log10() +
  scale_y_log10() +

  # Title, axis labels, and data-source caption
  labs(title = "Average normalized number of favorites\nversus user follower count",
       x = "Number of followers",
       y = "Average normalized number of favorites",
       caption = "\nSource: Data gathered via Twitter's standard `search/tweets` API using rtweet") +

  # Theme styling information
  theme_minimal(base_size = 15) +
  theme(axis.text = element_text(colour = "#222222"),
        plot.title = element_text(size = rel(1.7), face = "bold"),
        plot.subtitle = element_text(size = rel(1.3)),
        plot.caption = element_text(colour = "#444444"))

Chatterplot of tweet words

# One row per non-stop-word token, carrying the favorite count of its tweet
rt_no_stop <- rt %>%
  # Only the tweet text and its favorite count are needed
  select(text, favorite_count) %>%

  mutate(
    text = text %>%
      # Replace each web link with a single apostrophe
      # NOTE(review): presumably discarded again by tokenization -- confirm
      str_replace_all("https?[:graph:]+", "'") %>%

      # Strip mentions.
      # Usernames are alphanumeric and may contain underscores; they may be
      # preceded by "." or followed by some punctuation.
      # Twitter:
      #   help.twitter.com/en/managing-your-account/twitter-username-rules
      # To avoid matching emails:
      #   stackoverflow.com/questions/4424179/how-to-validate-a-twitter-username-using-regex#comment21201837_4424288
      str_replace_all(
        "\\.?@([:alnum:]|_){1,15}(?![.A-Za-z])[:graph:]?",
        ""
      )
  ) %>%

  # Split the text into single-word tokens
  unnest_tokens(word, text) %>%

  # Drop stop words (e.g., "a", "the", "and", etc)
  anti_join(get_stopwords())
## Joining, by = "word"


# Mean favorite count over all occurrences (rows) of each word
rt_word_avg_fav <- summarize(
  group_by(rt_no_stop, word),
  avg_fav = mean(favorite_count)
)


# Number of occurrences of each word, most frequent first
rt_counts <- count(rt_no_stop, word, sort = TRUE)


# Keep words seen more than once, drop the conference hashtag itself, and
# attach each word's average favorite score
chatter_rt <- rt_counts %>%
  filter(n > 1, word != "nlmitc19") %>%
  left_join(rt_word_avg_fav, by = "word")

Code below modified from “RIP wordclouds, long live CHATTERPLOTS”.

# Chatterplot: words placed by average favorites (x) and frequency (y), with
# text size mapped to frequency and text color mapped to average favorites
chatter_rt %>%
  # Add small offset average favorite counts because some are zero and we log
  # transform, which can introduce infinite values
  mutate(avg_fav = avg_fav + 0.001) %>%

  # Gather just top 100 mentions
  top_n(100, wt = n) %>%

  ggplot(aes(x = avg_fav, y = n, label = word)) +
  geom_text_repel(segment.alpha = 0,
                  aes(colour = avg_fav, size = n)) +

  # Set color gradient, log transform & customize legend
  scale_color_gradient(low = "green3", high = "violetred",
                       trans = "log10",
                       guide = guide_colourbar(direction = "horizontal",
                                               title.position = "top")) +
  # Set word size range & turn off legend.
  # `guide = "none"` replaces `guide = FALSE`, which is deprecated in
  # ggplot2 >= 3.3.0.
  scale_size_continuous(range = c(3, 10),
                        guide = "none") +

  # Use log-scale for x-axis
  scale_x_log10() +
  ggtitle(paste0("Top 100 words from ",
                  nrow(rt),
                 " #NLMITC19 tweets, by frequency"),
          subtitle = "Word frequency (size) ~ Avg number of favorites (color)") +
  labs(y = "Word frequency across all tweets",
       x = "Avg number of favorites in tweets containing word (log scale)",
       colour = "Avg num of favs (log)") +

  # minimal theme & customizations
  theme_minimal() +
  theme(legend.position = c(0.20, 0.99),
        legend.justification = c("right","top"),
        panel.grid.major = element_line(colour = "whitesmoke"))

Session information

# Print the R version, platform, locale, and package versions for this session
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rtweet_0.6.8    ggrepel_0.8.0   tidytext_0.2.0  forcats_0.4.0  
##  [5] stringr_1.4.0   dplyr_0.8.0.1   purrr_0.3.2     readr_1.3.1    
##  [9] tidyr_0.8.3     tibble_2.1.1    ggplot2_3.1.0   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5  xfun_0.5          haven_2.1.0      
##  [4] lattice_0.20-38   colorspace_1.4-1  generics_0.0.2   
##  [7] htmltools_0.3.6   SnowballC_0.6.0   yaml_2.2.0       
## [10] rlang_0.3.2       pillar_1.3.1      glue_1.3.1       
## [13] withr_2.1.2       modelr_0.1.4      readxl_1.3.1     
## [16] plyr_1.8.4        munsell_0.5.0     gtable_0.3.0     
## [19] cellranger_1.1.0  rvest_0.3.2       evaluate_0.13    
## [22] labeling_0.3      knitr_1.22        broom_0.5.1      
## [25] tokenizers_0.2.1  Rcpp_1.0.1        scales_1.0.0     
## [28] backports_1.1.3   jsonlite_1.6      stopwords_0.9.0  
## [31] hms_0.4.2         digest_0.6.18     stringi_1.4.3    
## [34] grid_3.5.0        cli_1.1.0         tools_3.5.0      
## [37] magrittr_1.5      lazyeval_0.2.2    janeaustenr_0.1.5
## [40] crayon_1.3.4      pkgconfig_2.0.2   Matrix_1.2-16    
## [43] xml2_1.2.0        lubridate_1.7.4   assertthat_0.2.1 
## [46] rmarkdown_1.12    httr_1.4.0        rstudioapi_0.10  
## [49] R6_2.4.0          nlme_3.1-137      compiler_3.5.0