For fun and organisation, let's scrape some horror films using functional R. Note that I would not personally classify all of the films in the list as horror.
library(polite)
library(rvest)
#> Loading required package: xml2
library(tidyverse)
#> ── Attaching packages ───────── tidyverse 1.3.0 ──
#> ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
#> ✓ tibble 3.0.3 ✓ dplyr 1.0.2
#> ✓ tidyr 1.1.2 ✓ stringr 1.4.0
#> ✓ readr 1.3.1 ✓ forcats 0.5.0
#> ── Conflicts ──────────── tidyverse_conflicts() ──
#> x dplyr::filter() masks stats::filter()
#> x readr::guess_encoding() masks rvest::guess_encoding()
#> x dplyr::lag() masks stats::lag()
#> x purrr::pluck() masks rvest::pluck()
library(rlang)
#>
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#>
#> %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#> flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#> splice
#> The following object is masked from 'package:xml2':
#>
#> as_list
library(googlesheets4)
Create a function factory that builds a scraper function for each CSS selector we want to pull out of the page.
# Function factory: given a CSS selector, build a scraper for that selector.
#
# css: a CSS selector string, handed to html_nodes().
# Returns a function of one argument `p` that selects the nodes of `p`
# matching `css`, extracts their text and trims surrounding whitespace.
scrape_factory <- function(css) {
  # Evaluate the argument now so the closure holds the selector's value
  # rather than a lazy promise.
  force(css)
  function(p) {
    matched_nodes <- html_nodes(p, css)
    str_trim(html_text(matched_nodes))
  }
}
# One scraper function per output column: each CSS selector below is turned
# into a function by scrape_factory(), and the list names are kept.
selectors <- map(
  list(
    ranking = ".text-primary",
    title = ".lister-item-content .col-title a",
    year = ".text-muted",
    imdb_rating = ".col-imdb-rating"
  ),
  scrape_factory
)
# Build the URL of each results page from the base URL.
#
# base_url: URL of the first results page.
# start:    offsets of the first film on each page; the default
#           (1, 51, 101, 151, 201) covers the first 250 films.
# Returns one URL per element of `start`: `base_url` itself where the offset
# is 1, otherwise `base_url` with a ",desc&start=<offset>&ref_=adv_nxt" suffix.
# The suffix begins with ",desc", so it extends whatever `base_url` ends with.
pagination <- function(base_url, start = seq(1, 201, by = 50)) {
  is_first_page <- start == 1
  later_page_url <- paste0(base_url, ",desc&start=", start, "&ref_=adv_nxt")
  ifelse(is_first_page, base_url, later_page_url)
}
# Search URL: feature films, genre horror, at least 25000 votes, sorted by
# user rating, simple view.
base_url <- "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=horror&sort=user_rating,desc&view=simple&sort=user_rating"
# Expand the base URL into one URL per results page, then pass each URL to
# bow() and the result to scrape(); `page` is a list with one entry per URL.
page <- base_url %>%
  pagination() %>%
  map(function(url) scrape(bow(url)))
# Per page: call every selector function on it and column-bind the results
# into one table. The per-page tables are then row-bound into a single table
# and a few columns are tidied.
top250_horror <- page %>%
  map_dfr(function(html) {
    map_dfc(selectors, function(selector) selector(html))
  }) %>%
  mutate(
    ranking = as.integer(ranking),
    imdb_rating = as.numeric(imdb_rating),
    # placeholder column, to be curated by the most prolific watcher
    has_casey_seen_it = NA
  )
top250_horror
#> # A tibble: 250 x 5
#> ranking title year imdb_rating has_casey_seen_it
#> <int> <chr> <chr> <dbl> <lgl>
#> 1 1 Psycho (1960) 8.5 NA
#> 2 2 The Shining (1980) 8.4 NA
#> 3 3 Alien (1979) 8.4 NA
#> 4 4 The Thing (1982) 8.1 NA
#> 5 5 What Ever Happened to Baby Jane? (1962) 8.1 NA
#> 6 6 The Cabinet of Dr. Caligari (1920) 8.1 NA
#> 7 7 The Exorcist (1973) 8 NA
#> 8 8 Rosemary's Baby (1968) 8 NA
#> 9 9 Les diaboliques (1955) 8 NA
#> 10 10 Let the Right One In (2008) 7.9 NA
#> # … with 240 more rows
# Placeholder: replace with the URL of the target Google Sheet.
gs_sheet_url <- "your-sheet-url"
# Look the sheet up, then write the table to its "imdb-top" worksheet.
sheet <- gs4_get(ss = gs_sheet_url)
sheet_write(data = top250_horror, ss = sheet, sheet = "imdb-top")