ropensci/stats19

Ids in vehicle and casualty datasets not in 'accidents' table for some years

Robinlovelace opened this issue · 3 comments

Reproducible example from @layik just tested:

library(stats19)
#> Data provided under OGL v3.0. Cite the source and link to:
#> www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
    #> Data provided under OGL v3.0. Cite the source and link to:
    #> www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
    # Labels for the four comparisons, in the order check() returns them.
    n <- c("caNotINac", "veNotINac", "acNotINca", "acNotINve")
    # Load the accident ("ac"), casualty ("ca") and vehicle ("ve") tables for
    # one year and report which rows carry an accident_index that is missing
    # from the table they are compared against.
    #
    # year: year forwarded to get_stats19(); NULL or NA raises an error.
    # Returns a list named by `n` holding row positions of: casualties not in
    # accidents, vehicles not in accidents, accidents not in casualties and
    # accidents not in vehicles.
    check <- function(year = 2019) {
        if (is.null(year) || is.na(year)) {
            stop("Year is required")
        }
        cache_dir <- "/tmp/sanity"
        # In this variant the accident table is requested as an sf object.
        ac <- get_stats19(
            year = year,
            type = "ac",
            output_format = "sf",
            data_dir = cache_dir
        )
        ca <- get_stats19(year = year, type = "ca", data_dir = cache_dir)
        ve <- get_stats19(year = year, type = "ve", data_dir = cache_dir)

        # Row positions in `from` whose accident_index never occurs in `to`.
        unmatched <- function(from, to) {
            which(!(from$accident_index %in% to$accident_index))
        }

        v <- list(
            unmatched(ca, ac),
            unmatched(ve, ac),
            unmatched(ac, ca),
            unmatched(ac, ve)
        )
        names(v) <- n
        v
    }
    # Run the comparison once per year.
    checks <- lapply(2015:2019, function(yr) check(yr))
#> Files identified: RoadSafetyData_Accidents_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Accidents_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
#> Reading in:
#> /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
#> date and time columns present, creating formatted datetime column
#> 27 rows removed with no coordinates
#> Files identified: RoadSafetyData_Casualties_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Casualties_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Casualties_2015/Casualties_2015.csv
#> Files identified: RoadSafetyData_Vehicles_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Vehicles_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Vehicles_2015/Vehicles_2015.csv
#> Files identified: dftRoadSafety_Accidents_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafety_Accidents_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafety_Accidents_2016/dftRoadSafety_Accidents_2016.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafety_Accidents_2016/dftRoadSafety_Accidents_2016.csv
#> date and time columns present, creating formatted datetime column
#> 7 rows removed with no coordinates
#> Files identified: dftRoadSafetyData_Casualties_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2016/Cas.csv
#> Files identified: dftRoadSafetyData_Vehicles_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2016/Veh.csv
#> Files identified: dftRoadSafetyData_Accidents_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Accidents_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Accidents_2017/Acc.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafetyData_Accidents_2017/Acc.csv
#> date and time columns present, creating formatted datetime column
#> 19 rows removed with no coordinates
#> Files identified: dftRoadSafetyData_Casualties_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2017/Cas.csv
#> Files identified: dftRoadSafetyData_Vehicles_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2017/Veh.csv
#> Files identified: dftRoadSafetyData_Accidents_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Accidents_2018.csv
#> Attempt downloading from:
#> Data saved at /tmp/sanity/dftRoadSafetyData_Accidents_2018.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafetyData_Accidents_2018.csv
#> date and time columns present, creating formatted datetime column
#> 55 rows removed with no coordinates
#> Files identified: dftRoadSafetyData_Casualties_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2018.csv
#> Attempt downloading from:
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2018.csv
#> Files identified: dftRoadSafetyData_Vehicles_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2018.csv
#> Attempt downloading from:
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2018.csv
#> Files identified: DfTRoadSafety_Accidents_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Accidents_2019.zip
#> Attempt downloading from:
#> Data saved at /tmp/sanity/DfTRoadSafety_Accidents_2019/Road Safety Data - Accidents 2019.csv
#> Reading in:
#> /tmp/sanity/DfTRoadSafety_Accidents_2019/Road Safety Data - Accidents 2019.csv
#> date and time columns present, creating formatted datetime column
#> 28 rows removed with no coordinates
#> Files identified: DfTRoadSafety_Casualties_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Casualties_2019.zip
#> Attempt downloading from:
#> Data saved at /tmp/sanity/DfTRoadSafety_Casualties_2019/Road Safety Data - Casualties 2019.csv
#> Files identified: DfTRoadSafety_Vehicles_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Vehicles_2019.zip
#> Attempt downloading from:
#> Data saved at /tmp/sanity/DfTRoadSafety_Vehicles_2019/Road Safety Data- Vehicles 2019.csv
    #> Files identified: RoadSafetyData_Accidents_2015.zip
    #>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Accidents_2015.zip
    #> Attempt downloading from:
    #> Data saved at /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
    #> Reading in:
    # ....
    #> Data saved at /tmp/sanity/DfTRoadSafety_Vehicles_2019/Road Safety Data- Vehicles 2019.csv
    # For every year, replace each vector of unmatched row positions by its
    # length, keeping the four comparison labels as element names.
    l <- lapply(checks, function(x) {
        sizes <- vapply(1:4, function(i) length(x[[i]]), integer(1))
        as.list(setNames(
            sizes,
            c("caNotINac", "veNotINac", "acNotINca", "acNotINve")
        ))
    })
    # One row per comparison, one column per year (the matrix fills by column).
    d <- as.data.frame(
        matrix(unlist(l, use.names = FALSE), nrow = length(l[[1]]))
    )
    colnames(d) <- 2015:2019
    row.names(d) <- n
    d
#>           2015 2016 2017 2018 2019
#> caNotINac   37   11   25   76   42
#> veNotINac   53   12   36   96   54
#> acNotINca    0    0    0    0    0
#> acNotINve    0    0    0    0    0

Created on 2020-10-21 by the reprex package (v0.3.0)

Session info
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.0.2 (2020-06-22)
#>  os       Ubuntu 20.04.1 LTS          
#>  system   x86_64, linux-gnu           
#>  ui       X11                         
#>  language (EN)                        
#>  collate  C.UTF-8                     
#>  ctype    C.UTF-8                     
#>  tz       Etc/UTC                     
#>  date     2020-10-21                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date       lib source                           
#>  assertthat    0.2.1   2019-03-21 [2] CRAN (R 4.0.2)                   
#>  backports     1.1.10  2020-09-15 [2] CRAN (R 4.0.2)                   
#>  callr         3.4.4   2020-09-07 [2] CRAN (R 4.0.2)                   
#>  class         7.3-17  2020-04-26 [2] CRAN (R 4.0.2)                   
#>  classInt      0.4-3   2020-04-07 [2] CRAN (R 4.0.2)                   
#>  cli           2.1.0   2020-10-12 [1] CRAN (R 4.0.2)                   
#>  crayon        1.3.4   2017-09-16 [2] CRAN (R 4.0.2)                   
#>  DBI           1.1.0   2019-12-15 [2] CRAN (R 4.0.2)                   
#>  desc          1.2.0   2018-05-01 [2] CRAN (R 4.0.2)                   
#>  devtools      2.3.2   2020-09-18 [2] CRAN (R 4.0.2)                   
#>  digest        0.6.26  2020-10-17 [1] CRAN (R 4.0.2)                   
#>  dplyr         1.0.2   2020-08-18 [2] CRAN (R 4.0.2)                   
#>  e1071         1.7-4   2020-10-14 [1] CRAN (R 4.0.2)                   
#>  ellipsis      0.3.1   2020-05-15 [2] CRAN (R 4.0.2)                   
#>  evaluate      0.14    2019-05-28 [2] CRAN (R 4.0.2)                   
#>  fansi         0.4.1   2020-01-08 [2] CRAN (R 4.0.2)                   
#>  fs            1.5.0   2020-07-31 [2] CRAN (R 4.0.2)                   
#>  generics      0.0.2   2018-11-29 [2] CRAN (R 4.0.2)                   
#>  glue          1.4.2   2020-08-27 [2] CRAN (R 4.0.2)                   
#>  highr         0.8     2019-03-20 [2] CRAN (R 4.0.2)                   
#>  hms           0.5.3   2020-01-08 [2] CRAN (R 4.0.2)                   
#>  htmltools     0.5.0   2020-06-16 [2] CRAN (R 4.0.2)                   
#>  KernSmooth    2.23-17 2020-04-26 [2] CRAN (R 4.0.2)                   
#>  knitr         1.29    2020-06-23 [2] CRAN (R 4.0.2)                   
#>  lifecycle     0.2.0   2020-03-06 [2] CRAN (R 4.0.2)                   
#>  magrittr      1.5     2014-11-22 [2] CRAN (R 4.0.2)                   
#>  memoise       1.1.0   2017-04-21 [2] CRAN (R 4.0.2)                   
#>  pillar        1.4.6   2020-07-10 [2] CRAN (R 4.0.2)                   
#>  pkgbuild      1.1.0   2020-07-13 [2] CRAN (R 4.0.2)                   
#>  pkgconfig     2.0.3   2019-09-22 [2] CRAN (R 4.0.2)                   
#>  pkgload       1.1.0   2020-05-29 [2] CRAN (R 4.0.2)                   
#>  prettyunits   1.1.1   2020-01-24 [2] CRAN (R 4.0.2)                   
#>  processx      3.4.4   2020-09-03 [2] CRAN (R 4.0.2)                   
#>  ps            1.3.4   2020-08-11 [2] CRAN (R 4.0.2)                   
#>  purrr         0.3.4   2020-04-17 [2] CRAN (R 4.0.2)                   
#>  R6            2.4.1   2019-11-12 [2] CRAN (R 4.0.2)                   
#>  Rcpp          1.0.5   2020-07-06 [2] CRAN (R 4.0.2)                   
#>  readr         1.4.0   2020-10-05 [1] CRAN (R 4.0.2)                   
#>  remotes       2.2.0   2020-07-21 [2] CRAN (R 4.0.2)                   
#>  rlang         0.4.8   2020-10-08 [1] CRAN (R 4.0.2)                   
#>  rmarkdown     2.3     2020-06-18 [2] CRAN (R 4.0.2)                   
#>  rprojroot     1.3-2   2018-01-03 [2] CRAN (R 4.0.2)                   
#>  sessioninfo   1.1.1   2018-11-05 [2] CRAN (R 4.0.2)                   
#>  sf            0.9-6   2020-09-13 [2] CRAN (R 4.0.2)                   
#>  stats19     * 1.3.0   2020-10-21 [1] Github (ropensci/stats19@b351484)
#>  stringi       1.5.3   2020-09-09 [2] CRAN (R 4.0.2)                   
#>  stringr       1.4.0   2019-02-10 [2] CRAN (R 4.0.2)                   
#>  testthat      2.3.2   2020-03-02 [2] CRAN (R 4.0.2)                   
#>  tibble        3.0.4   2020-10-12 [1] CRAN (R 4.0.2)                   
#>  tidyselect    1.1.0   2020-05-11 [2] CRAN (R 4.0.2)                   
#>  units         0.6-7   2020-06-13 [2] CRAN (R 4.0.2)                   
#>  usethis       1.6.3   2020-09-17 [2] CRAN (R 4.0.2)                   
#>  vctrs         0.3.4   2020-08-29 [2] CRAN (R 4.0.2)                   
#>  withr         2.2.0   2020-04-20 [2] CRAN (R 4.0.2)                   
#>  xfun          0.17    2020-09-09 [2] CRAN (R 4.0.2)                   
#>  yaml          2.2.1   2020-02-01 [2] CRAN (R 4.0.2)                   
#> 
#> [1] /home/robin/R/x86_64-pc-linux-gnu-library/4.0
#> [2] /opt/R/4.0.2/lib/R/library

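Heads-up @layik, I think there's a simple solution: don't remove the accident records with NAs in the coordinates. Converting the accidents table to an sf object drops the crashes that have no coordinates (see the "rows removed with no coordinates" messages above), so their casualty and vehicle records appear to have no matching accident. Without `output_format = "sf"` every count is zero, as shown below: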

library(stats19)
#> Data provided under OGL v3.0. Cite the source and link to:
#> www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
    #> Data provided under OGL v3.0. Cite the source and link to:
    #> www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
    # Labels for the four comparisons, in the order check() returns them.
    n <- c("caNotINac", "veNotINac", "acNotINca", "acNotINve")
    # Load the accident ("ac"), casualty ("ca") and vehicle ("ve") tables for
    # one year and report which rows carry an accident_index that is missing
    # from the table they are compared against.
    #
    # year: year forwarded to get_stats19(); NULL or NA raises an error.
    # Returns a list named by `n` holding row positions of: casualties not in
    # accidents, vehicles not in accidents, accidents not in casualties and
    # accidents not in vehicles.
    check <- function(year = 2019) {
        if (is.null(year) || is.na(year)) {
            stop("Year is required")
        }
        cache_dir <- "/tmp/sanity"
        # All three tables are requested without an output_format argument.
        ac <- get_stats19(year = year, type = "ac", data_dir = cache_dir)
        ca <- get_stats19(year = year, type = "ca", data_dir = cache_dir)
        ve <- get_stats19(year = year, type = "ve", data_dir = cache_dir)

        # Row positions in `from` whose accident_index never occurs in `to`.
        unmatched <- function(from, to) {
            which(!(from$accident_index %in% to$accident_index))
        }

        v <- list(
            unmatched(ca, ac),
            unmatched(ve, ac),
            unmatched(ac, ca),
            unmatched(ac, ve)
        )
        names(v) <- n
        v
    }
    # Run the comparison once per year.
    checks <- lapply(2015:2019, function(yr) check(yr))
#> Files identified: RoadSafetyData_Accidents_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Accidents_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
#> Reading in:
#> /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
#> date and time columns present, creating formatted datetime column
#> Files identified: RoadSafetyData_Casualties_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Casualties_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Casualties_2015/Casualties_2015.csv
#> Files identified: RoadSafetyData_Vehicles_2015.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Vehicles_2015.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/RoadSafetyData_Vehicles_2015/Vehicles_2015.csv
#> Files identified: dftRoadSafety_Accidents_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafety_Accidents_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafety_Accidents_2016/dftRoadSafety_Accidents_2016.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafety_Accidents_2016/dftRoadSafety_Accidents_2016.csv
#> date and time columns present, creating formatted datetime column
#> Files identified: dftRoadSafetyData_Casualties_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2016/Cas.csv
#> Files identified: dftRoadSafetyData_Vehicles_2016.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2016.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2016/Veh.csv
#> Files identified: dftRoadSafetyData_Accidents_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Accidents_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Accidents_2017/Acc.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafetyData_Accidents_2017/Acc.csv
#> date and time columns present, creating formatted datetime column
#> Files identified: dftRoadSafetyData_Casualties_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2017/Cas.csv
#> Files identified: dftRoadSafetyData_Vehicles_2017.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2017.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2017/Veh.csv
#> Files identified: dftRoadSafetyData_Accidents_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Accidents_2018.csv
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Accidents_2018.csv
#> Reading in:
#> /tmp/sanity/dftRoadSafetyData_Accidents_2018.csv
#> date and time columns present, creating formatted datetime column
#> Files identified: dftRoadSafetyData_Casualties_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Casualties_2018.csv
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Casualties_2018.csv
#> Files identified: dftRoadSafetyData_Vehicles_2018.csv
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/dftRoadSafetyData_Vehicles_2018.csv
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/dftRoadSafetyData_Vehicles_2018.csv
#> Files identified: DfTRoadSafety_Accidents_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Accidents_2019.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/DfTRoadSafety_Accidents_2019/Road Safety Data - Accidents 2019.csv
#> Reading in:
#> /tmp/sanity/DfTRoadSafety_Accidents_2019/Road Safety Data - Accidents 2019.csv
#> date and time columns present, creating formatted datetime column
#> Files identified: DfTRoadSafety_Casualties_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Casualties_2019.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/DfTRoadSafety_Casualties_2019/Road Safety Data - Casualties 2019.csv
#> Files identified: DfTRoadSafety_Vehicles_2019.zip
#>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/DfTRoadSafety_Vehicles_2019.zip
#> Data already exists in data_dir, not downloading
#> Data saved at /tmp/sanity/DfTRoadSafety_Vehicles_2019/Road Safety Data- Vehicles 2019.csv
    #> Files identified: RoadSafetyData_Accidents_2015.zip
    #>    http://data.dft.gov.uk.s3.amazonaws.com/road-accidents-safety-data/RoadSafetyData_Accidents_2015.zip
    #> Attempt downloading from:
    #> Data saved at /tmp/sanity/RoadSafetyData_Accidents_2015/Accidents_2015.csv
    #> Reading in:
    # ....
    #> Data saved at /tmp/sanity/DfTRoadSafety_Vehicles_2019/Road Safety Data- Vehicles 2019.csv
    # For every year, replace each vector of unmatched row positions by its
    # length, keeping the four comparison labels as element names.
    l <- lapply(checks, function(x) {
        sizes <- vapply(1:4, function(i) length(x[[i]]), integer(1))
        as.list(setNames(
            sizes,
            c("caNotINac", "veNotINac", "acNotINca", "acNotINve")
        ))
    })
    # One row per comparison, one column per year (the matrix fills by column).
    d <- as.data.frame(
        matrix(unlist(l, use.names = FALSE), nrow = length(l[[1]]))
    )
    colnames(d) <- 2015:2019
    row.names(d) <- n
    d
#>           2015 2016 2017 2018 2019
#> caNotINac    0    0    0    0    0
#> veNotINac    0    0    0    0    0
#> acNotINca    0    0    0    0    0
#> acNotINve    0    0    0    0    0

Created on 2020-10-21 by the reprex package (v0.3.0)

layik commented

Great! Will update the rrsrr chapter accordingly. Thanks @Robinlovelace.

Good stuff. I've already made some updates; it's an important but as yet unfinished chapter, so any updates are very welcome. Many thanks, @layik!