hypertidy/silicate

dm as the data model engine

Opened this issue · 6 comments

This looks very promising:

## devtools::install_github("krlmlr/dm")


  library(silicate)
  sc <- SC(minimal_mesh)

  library(dm)

  unclass(sc) %>% as_dm() %>% 
  cdm_add_pk(object, object_) %>% 
  cdm_add_fk(object_link_edge, object_, object) %>% 
  cdm_add_pk(edge, edge_) %>% 
  cdm_add_fk(object_link_edge, edge_, edge) %>% 
  cdm_add_pk(vertex, vertex_) %>% 
  cdm_add_fk(edge, .vx0, vertex) %>% 
  cdm_add_fk(edge, .vx1, vertex)  %>% 
  cdm_draw()

image

tri <- TRI(minimal_mesh)
unclass(tri) %>% as_dm() %>% 
  cdm_add_pk(object, object_) %>% 
  cdm_add_fk(triangle, object_, object) %>% 
  cdm_add_pk(vertex, vertex_) %>% 
  cdm_add_fk(triangle, .vx0, vertex) %>% 
  cdm_add_fk(triangle, .vx1, vertex) %>% 
  cdm_add_fk(triangle, .vx2, vertex) %>% 
  cdm_draw()
  

image

With osmdata we need to turn off checks because object_ is not unique or exclusive.

  library(osmdata)
  
 sci <-  opq ("hampi india") %>%
    add_osm_feature (key="historic", value="ruins") %>%
    osmdata_sc () 
 
    unclass(sci) %>% as_dm() %>% 
    cdm_add_pk(object, object_, check = F) %>% 
    cdm_add_fk(object_link_edge, object_, object, check = FALSE) %>% 
    cdm_add_pk(edge, edge_) %>% 
    cdm_add_fk(object_link_edge, edge_, edge) %>% 
    cdm_add_pk(vertex, vertex_) %>% 
    cdm_add_fk(edge, .vx0, vertex) %>% 
    cdm_add_fk(edge, .vx1, vertex)

    ── Table source ────────────────────────────────────────────────────────────────────────────────────────
    src:  <environment: 0x55b33d6bdce0>
      ── Data model ──────────────────────────────────────────────────────────────────────────────────────────
    Data model object:
      8 tables:  edge, meta, nodes, object ... 
    27 columns
    3 primary keys
    4 references
    ── Rows ────────────────────────────────────────────────────────────────────────────────────────────────
    Total: 489
    edge: 127, meta: 1, nodes: 39, object: 45, object_link_edge: 127, relation_members: 2, relation_properties: 3, vertex: 145
    

@mpadge basically what I was trying to do with https://github.com/hypertidy/rbot but obviously much more promising!

Methods for as_dm

  library(silicate)

  as_dm.SC <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(object_link_edge, object_, object) %>% 
      cdm_add_pk(edge, edge_) %>% 
      cdm_add_fk(object_link_edge, edge_, edge) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(edge, .vx0, vertex) %>% 
      cdm_add_fk(edge, .vx1, vertex) 
  }
  as_dm.TRI <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(triangle, object_, object) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(triangle, .vx0, vertex) %>% 
      cdm_add_fk(triangle, .vx1, vertex) %>% 
      cdm_add_fk(triangle, .vx2, vertex)
  }
  
  as_dm.ARC <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(object_link_arc, object_, object) %>% 
      ## here we need table normalization (somehow, possibly by composing ARC from SC)
      cdm_add_pk(arc_link_vertex, arc_, check = FALSE) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(arc_link_vertex, vertex_, vertex) 
  }
  
  as_dm.PATH <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(path, object_, object) %>% 
      ## here we need table normalization (somehow, possibly by composing PATH from SC)
      cdm_add_pk(path_link_vertex, path_, check = FALSE) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(path_link_vertex, vertex_, vertex) 
  }
  
  library(dm)
as_dm(SC(minimal_mesh))
as_dm(TRI(minimal_mesh))
as_dm(ARC(minimal_mesh))
as_dm(PATH(minimal_mesh))

decompose_table is similar to unjoin

unjoin::unjoin(mtcars, am, gear, carb, key_col = "parent_table")

decompose_table(mtcars, new_id, am, gear, carb)

That's awesome - I hadn't even seen that

We don't currently handle cycles (=parallel edges in this case) very well. Would a "longer form" be suitable for the triangle?

I've actually grappled with that from very early on, and pretty sure I started with a long triangle form. A follow up question, when you declare which table to dm_filter() have you considered a tidygraph::activate workflow, so that a given table is put upfront and stays there?

That originally seemed to me to be the way to go, but maybe it's better to always declare the table in these verbs?

What follows is just here as a note to self, it took me a bit of to and fro to get it working, but it's worth exploring.

Here I try it out on-the-fly with conversion to and from dm, with a filter on object that culls triangles and vertices:

  as_dm_TRI_longform <- function(x) {
  x <- unclass(x)
  x$triangle <- x$triangle %>% 
    tidyr::pivot_longer(starts_with(".vx"),  
                                    names_to = "corner", values_to = "vertex_")
  
  x %>% as_dm() %>% 
    dm_add_pk(object, object_) %>% 
    dm_add_fk(triangle, object_, object) %>% 
    dm_add_pk(vertex, vertex_) %>% 
    dm_add_fk(triangle, vertex_, vertex)
}
as_TRIlongform_dm <- function(x) {
  x <- dm_apply_filters(x) %>% dm_get_tables()
  x$triangle <- x$triangle %>% 
    tidyr::pivot_wider(names_from = corner, 
                values_from = vertex_) %>% 
    tidyr::unnest(cols = c(.vx0, .vx1, .vx2))
  class(x) <- c("TRI", "sc")
  x
}
library(silicate)
#> 
#> Attaching package: 'silicate'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dm)
#> 
#> Attaching package: 'dm'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
tri <- TRI(minimal_mesh)
## a dm version of TRI
x <- as_dm_TRI_longform(tri) 
x
#> ── Table source ─────────────────────────────────────────────────────────
#> src:  <environment: R_GlobalEnv>
#> ── Metadata ─────────────────────────────────────────────────────────────
#> Tables: `object`, `triangle`, `vertex`, `meta`
#> Columns: 11
#> Primary keys: 2
#> Foreign keys: 2
#validate_dm(x)
#dm_get_tables(x)

## round-trip
tri_f <- x %>% dm_filter(object, a == 1) %>% as_TRIlongform_dm()
#> Warning: Values in `vertex_` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(vertex_ = list)` to suppress this warning.
#> * Use `values_fn = list(vertex_ = length)` to identify where the duplicates arise
#> * Use `values_fn = list(vertex_ = summary_fun)` to summarise duplicates
par(mfrow = c(1, 2))
plot(tri, col = grey.colors(nrow(tri$triangle)))
plot(tri_f, col = grey.colors(nrow(tri_f$triangle)))

Created on 2020-03-21 by the reprex package (v0.3.0)

This is pretty good

  ## convert TRI to a longform triangle and then to dm
  as_dm_TRI_longform <- function(x) {
  x <- unclass(x)
  x$triangle <- x$triangle %>% 
    tidyr::pivot_longer(starts_with(".vx"),  
                                    names_to = "corner", values_to = "vertex_")
  
  x %>% as_dm() %>% 
    dm_add_pk(object, object_) %>% 
    dm_add_fk(triangle, object_, object) %>% 
    dm_add_pk(vertex, vertex_) %>% 
    dm_add_fk(triangle, vertex_, vertex)
  }
 ## convert a dm with longform TRI to TRI 
 as_TRIlongform_dm <- function(x) {
  x <- x %>% dm_apply_filters() %>% dm_get_tables() %>% 
    purrr::map(dplyr::collect)
    
  x$triangle <- x$triangle %>% 
    tidyr::pivot_wider(names_from = corner, 
                values_from = vertex_) %>% 
    tidyr::unnest(cols = c(.vx0, .vx1, .vx2))
  class(x) <- c("TRI", "sc")
  x
}
library(silicate)
#> 
#> Attaching package: 'silicate'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dm)
#> 
#> Attaching package: 'dm'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
 
 ## sf polygons of Provinces, in triangulated form (just because)
 tri <- TRI(inlandwaters)
 ## a dm version of TRI
 x <- as_dm_TRI_longform(tri) 
 x
#> ── Table source ─────────────────────────────────────────────────────────
#> src:  <environment: R_GlobalEnv>
#> ── Metadata ─────────────────────────────────────────────────────────────
#> Tables: `object`, `triangle`, `vertex`, `meta`
#> Columns: 12
#> Primary keys: 2
#> Foreign keys: 2
 ## unlink("afile.sql3")
 src <- dplyr::src_sqlite("afile.sql3", create = TRUE)
 sc <- copy_dm_to(src, x, temporary = FALSE)

 rm(tri, x)
 pryr::object_size(sc)
#> Registered S3 method overwritten by 'pryr':
#>   method      from
#>   print.bytes Rcpp
#> 19.2 kB
 file.info("afile.sql3")$size/1e6
#> [1] 7.589888

 ## apply filters to the object, and vertex tables and collect as TRI
tas_north <- sc %>% 
  dm_filter(object, Province == "Tasmania") %>% 
  dm_filter(vertex, y_ > -1500000) %>% 
  as_TRIlongform_dm()
#> Warning: Values in `vertex_` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(vertex_ = list)` to suppress this warning.
#> * Use `values_fn = list(vertex_ = length)` to identify where the duplicates arise
#> * Use `values_fn = list(vertex_ = summary_fun)` to summarise duplicates

  
## in the original sf "Province == "Tasmania" is a widely distributed
## set of islands, particularly the very tiny and to the far
## SE Macquarie Island, so we pick Tas and then zoom up to the north
## of the main islands of the province
par(mfrow = c(1, 2))
library(sf); plot(inlandwaters[5, 1]$geom, col = "grey")
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 7.0.0
plot(tas_north)

Created on 2020-03-21 by the reprex package (v0.3.0)