Lab report

The main structure of this repo is given by three folders:

-- code
|   |-- scrape-presslog.R       # contains functions one_page and duplicates
-- data
|   |-- presslog-20210209.csv   # data files in csv form   
...
|   |-- presslog-20230220.csv
-- presslogs                    # pdf files of press logs

Details on each part of the lab

Load packages

library(tidyverse, quietly = TRUE)
library(tabulizer)
library(lubridate, quietly = TRUE)

The duplicates function takes a single character string and checks whether it consists of several entries separated by carriage returns (\r). If so, only the first distinct value is returned, and a warning is issued when the entries are not all identical.

duplicates <- function(string) {
  # Reduce a '\r'-separated string of repeated entries to its first entry.
  #
  # string:  a single character string (anything of length != 1 is an error).
  # returns: the first distinct entry; NA for an empty string, because
  #          splitting "" yields no entries at all.
  # warns:   when the entries are not all identical.
  stopifnot(length(string) == 1)

  # break the string at every '\r' and keep each distinct entry once
  entries <- strsplit(string, split = "\r")[[1]]
  distinct <- unique(entries)

  if (length(distinct) > 1) {
    warning(sprintf("Multiple different values found in string: %s",
                    paste(distinct, collapse = ",")))
  }

  distinct[1]
}

The one_page function takes a matrix of entries from the Ames Press Log and returns a data frame of the data in cleaner form. Note that we only return columns that have a name and are not all empty.

one_page <- function(plogi) {
  # Turn one page of the press log into a tibble.
  #
  # plogi:   character matrix for a single page; its first row holds the
  #          column headers.
  # returns: a tibble with one row per remaining matrix row, named after the
  #          headers, with `Incident ID` and
  #          `Report Number Assigned to Event` parsed to numbers.

  # Variables are in the first data row; replace any '\r' inside a header
  # by a space
  variables <- gsub("\r", " ", plogi[1, ])
  plogi <- plogi[-1, , drop = FALSE]

  # collapse repeated entries cell by cell, one column at a time;
  # seq_len() iterates zero times for a zero-column page, where 1:ncol()
  # would run over c(1, 0)
  for (i in seq_len(ncol(plogi))) {
    plogi[, i] <- purrr::map_chr(plogi[, i], .f = duplicates)
  }

  # remove empty columns: every unnamed column that holds no values is
  # dropped on its own, so one non-empty unnamed column no longer keeps the
  # empty ones alive (empty cells come back from duplicates() as NA)
  unnamed <- which(variables == "")
  is_empty <- vapply(unnamed, function(j) all(is.na(plogi[, j])), logical(1))
  drop_cols <- unnamed[is_empty]
  if (length(drop_cols) > 0) {
    plogi <- plogi[, -drop_cols, drop = FALSE]
    variables <- variables[-drop_cols]
  }

  # the matrix carries no usable column names, so build the tibble without
  # name repair and attach the headers afterwards
  dframe <- as_tibble(plogi, .name_repair = "minimal")
  names(dframe) <- variables

  dframe <- dframe %>% mutate(
    `Incident ID` = parse_number(`Incident ID`),
    `Report Number Assigned to Event` = parse_number(`Report Number Assigned to Event`)
  )
  dframe
}

Download the current press log and save it in the presslogs folder as PressLog-XXX.pdf, where XXX is today’s date. The file is only downloaded if it does not exist yet.

# Path of today's press log, e.g. presslogs/PressLog-2023-02-20.pdf.
# Computed once and reused, so the existence check and the download target
# always refer to the same file.
todays_pdf <- sprintf("presslogs/PressLog-%s.pdf", lubridate::today())

if (!file.exists(todays_pdf)) { # only download once
  # mode = "wb": a pdf is a binary file; the default text mode corrupts it
  # on Windows
  download.file("https://data.city.ames.ia.us/publicinformation/PressLog.pdf",
                destfile = todays_pdf, mode = "wb")
}

We are using the tabulizer functionality to extract values from the pdf. We specify the output to be a matrix using the method lattice (to reduce the number of possible return values).

# read the tables of today's pdf; one matrix per extracted table
plog <- extract_tables(todays_pdf, output = "matrix", method = "lattice")

# clean every page with one_page() and stack the results into one data frame
all_pages <- purrr::map_df(plog, .f = one_page)

# save the values into a csv file named after today's date
date <- lubridate::today()
csv_file <- sprintf("data/presslog-%s%02d%02d.csv",
                    year(date), month(date), mday(date))
write_csv(all_pages, file = csv_file)

Get the call codes by running the locate_areas function on one of the pdfs:

# select an area on page 1 of the pdf; the result is kept in foo
foo <- locate_areas(todays_pdf, pages = 1)

# page areas given as top / left / bottom / right coordinates
codes <- c(top = 474.79543, left = 20.19263, bottom = 546.14273, right = 747.12743)
date <- c(top = 31.90366, left = 700.01128, bottom = 49.40395, right = 764.62771)
title <- c(top = 39.98072, left = 142.69461, bottom = 65.55805, right = 652.89514)

# read only the call-code area of page 1, without table guessing
call_codes <- extract_tables(todays_pdf, pages = 1, area = list(codes), guess = FALSE)

# glue the first column of the extracted table into a single string
call_codes <- paste(call_codes[[1]][, 1], collapse = "")

# one entry per comma, without leading and trailing white space
codes_list <- trimws(str_split(call_codes, pattern = ",")[[1]])

# each entry has the form code=description: the part before the first '='
# is the code, the part after it the description (NA if there is no '=')
codes <- str_split(codes_list, pattern = "=")
codes_df <- tibble(
  code = vapply(codes, function(x) x[1], character(1)),
  description = vapply(codes, function(x) x[2], character(1))
)

write.csv(codes_df, "data/call_codes.csv")

Process all of the other pdf files.

# all stored press logs; the pattern is a regular expression, so anchor it to
# match only file names that end in ".pdf" rather than any name containing "pdf"
pdfs <- dir("presslogs", pattern = "\\.pdf$", full.names = TRUE)
# remove today's press log:
pdfs <- setdiff(pdfs, todays_pdf)

for (pdf_file in pdfs) {
  plog <- extract_tables(pdf_file, output = "matrix", method = "lattice")

  all_pages <- plog %>% purrr::map_df(.f = one_page)

  # the csv file is named after the date of the first call in the log
  date <- lubridate::mdy_hm(all_pages$`Call Received Date/Time`[1])

  # without a usable date the file name would become presslog-NANANA.csv and
  # be overwritten by the next such log, so skip the file and say so
  if (length(date) != 1 || is.na(date)) {
    warning(sprintf("Skipping %s: could not determine the date of the log",
                    pdf_file), call. = FALSE)
    next
  }

  write_csv(all_pages, file = sprintf("data/presslog-%s%02d%02d.csv", year(date),
                                      month(date), mday(date)))
}

Stat585-at-ISU/Lab-2-after

Lab report

Details on each part of the lab