dcooley/sfheaders

performance improvements

Opened this issue · 3 comments

library(sf)
library(sfheaders)

# Benchmark input: the first 5000 rows of the mapdeck roads data.
sf_lines <- mapdeck::roads[1:5000, ]

# Only sf and sfheaders are attached above, so microbenchmark is called through
# its namespace; a bare `microbenchmark(` would fail in a fresh session.
microbenchmark::microbenchmark(
  # Round trip that carries four attribute columns through sf_to_df().
  df_cols = {
    df <- sfheaders::sf_to_df(
      sf_lines[, c("EZI_RDNAME", "FQID", "FROM_UFI", "LEFT_LOC")],
      fill = TRUE
    )
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      close = FALSE # , keep = FALSE
    )
  },
  # Round trip on the geometry column only (no attribute columns).
  df_no_cols = {
    df <- sfheaders::sfc_to_df(sf_lines$geometry)
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      close = FALSE
    )
  },
  times = 5
)

# Unit: seconds
#       expr      min       lq     mean   median       uq      max neval
#    df_cols 7.669546 8.339877 8.547827 8.858592 8.930842 8.940277     5
# df_no_cols 3.539817 3.585354 3.826810 3.709092 3.794609 4.505177     5
  • possibly my subset_dataframe() function. If we're not keeping the columns at the end, we can get rid of them at the start, rather than keeping them all in each iteration.
  • can probably subset the input df at the start just by the geometry_columns and id columns, then use the remaining columns after to 'fill / keep'

TODO

  • sf_to_df() slows down for large POINT objects - #52
  • remove unnecessary columns from subset_dataframe()
  • test performance of matrix subsets too

to_df performs well against cast(, "POINT")

library(sf)
library(mapdeck)
library(microbenchmark)

# Input: the complete `roads` object from the mapdeck package.
r <- mapdeck::roads

# Time sf_cast() to "point" against sf_to_df() with fill = TRUE on the same
# input; no `times` is given, so microbenchmark's default repetition applies.
microbenchmark(
  cast = {
    sfheaders::sf_cast(r, to = "point")
  },
  to_df = {
    sfheaders::sf_to_df(r, fill = TRUE)
  }
)

# Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  cast 36.15544 43.41139 53.95819 49.19889 57.00329 183.3839   100
# to_df 33.50790 40.04940 50.19918 47.14100 54.02317 186.2511   100

# Read the nc.shp shapefile that ships inside the sf package.
nc <- sf::st_read(system.file("./shape/nc.shp", package = "sf"))

# Stack the object on itself: 10 copies, then 10 copies of that result,
# giving 100 times the original number of rows.
nc <- do.call(rbind, rep(list(nc), 10L))
nc <- do.call(rbind, rep(list(nc), 10L))

# Same comparison as for the roads data, limited to 5 repetitions.
microbenchmark(
  cast = {
    sfheaders::sf_cast(nc, to = "point")
  },
  to_df = {
    sfheaders::sf_to_df(nc, fill = TRUE)
  },
  times = 5
)

# Unit: milliseconds
#  expr      min       lq     mean   median       uq      max neval
#  cast 149.0014 189.0628 268.8441 203.0699 390.8134 412.2731     5
# to_df 104.1424 108.2034 141.8837 111.8821 119.6658 265.5251     5

yeah that's it, subset_dataframe() doesn't need to include EVERY column EVERY iteration

We only need the columns used to build the sfg object

   // Fragment from inside a larger routine; the enclosing function signature
   // and the declarations of i, start, end, sfc, etc. are not shown here.
   //Rcpp::StringVector df_names = df.names();

    // before going into this loop I only need the columns from `df` which will make the geometries
    // so I can get rid of all the others?
    // keep_columns is the concatenation of geometry_cols and linestring_id.
    Rcpp::StringVector keep_columns = sfheaders::utils::concatenate_vectors( geometry_cols, linestring_id );
    Rcpp::DataFrame df_keep = df[ keep_columns ];
    Rcpp::StringVector df_names = df_keep.names();


    for( i = 0; i < n_polygons; ++i ) {
      // start / end come from columns 0 and 1 of polygon_positions
      // (presumably the row range belonging to polygon i -- confirm).
      start = polygon_positions( i, 0 );
      end = polygon_positions( i, 1 );
      // Rcpp::Rcout << "subsetting" << std::endl;
      // Subset the reduced df_keep rather than the full `df`, so the columns
      // that were dropped above are not carried through every iteration.
      Rcpp::DataFrame df_subset = sfheaders::utils::subset_dataframe( df_keep, df_names, start, end );
      // Rcpp::Rcout << "subset" << std::endl;
      sfc( i ) = sfheaders::sfg::sfg_polygon( df_subset, geometry_cols, linestring_id, close );
    }
# microbenchmark is called through its namespace so this snippet does not rely
# on the package having been attached earlier in the session.
# `sf_lines` is expected to exist already; it is not defined in this snippet.
microbenchmark::microbenchmark(
  # Round trip keeping every column of sf_lines, with an explicit linestring_id.
  df_cols = {
    df <- sfheaders::sf_to_df(sf_lines, fill = TRUE)
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      linestring_id = "linestring_id",
      close = FALSE # , keep = FALSE
    )
  },
  # Round trip on the geometry column only.
  df_no_cols = {
    df <- sfheaders::sfc_to_df(sf_lines$geometry)
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      close = FALSE
    )
  },
  times = 5
)

# Unit: milliseconds
#       expr      min       lq     mean   median       uq      max neval
#    df_cols 473.2646 506.1085 534.9238 522.2457 567.5169 605.4832     5
# df_no_cols 477.7957 491.4846 521.8224 530.8541 546.8970 562.0803     5

done

library(sf)
library(sfheaders)

# Benchmark input: the first 5000 rows of the mapdeck roads data.
sf_lines <- mapdeck::roads[1:5000, ]

microbenchmark::microbenchmark(
  # Round trip that carries four attribute columns through sf_to_df().
  df_cols = {
    df <- sfheaders::sf_to_df(
      sf_lines[, c("EZI_RDNAME", "FQID", "FROM_UFI", "LEFT_LOC")],
      fill = TRUE
    )
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      close = FALSE # , keep = FALSE
    )
  },
  # Round trip on the geometry column only (no attribute columns).
  df_no_cols = {
    df <- sfheaders::sfc_to_df(sf_lines$geometry)
    sf_poly <- sfheaders::sfc_polygon(
      obj = df,
      x = "x",
      y = "y",
      polygon_id = "sfg_id",
      close = FALSE
    )
  },
  times = 5
)

# Unit: seconds
#       expr      min       lq     mean   median       uq      max neval
#    df_cols 1.344446 1.353178 1.372559 1.379976 1.382399 1.402795     5
# df_no_cols 1.336737 1.375887 1.406624 1.397327 1.432684 1.490484     5