bionomia/bionomia

Redesign the JSON-LD output for occurrence records

Closed this issue · 7 comments

The occurrence-level JSON-LD output is not semantically correct. It would be better to present it as an annotation so that we can attach metadata to the link between subject and object as desired.

@rdmpage Am toying with a more semantically appropriate way to represent JSON-LD from Bionomia w/o breaking downstream re-use. I saw your README at https://github.com/rdmpage/wild-json-ld.

Besides the nasty context for now, does this seem better? The doubly nested reverse is...ugh. But, have no idea how to better represent this.

{
 "@context":{
  "@vocab":"http://schema.org/",
  "identified":"http://rs.tdwg.org/dwc/iri/identifiedBy",
  "recorded":"http://rs.tdwg.org/dwc/iri/recordedBy",
  "PreservedSpecimen":"http://rs.tdwg.org/dwc/terms/PreservedSpecimen",
  "as":"https://www.w3.org/ns/activitystreams#",
  "oa":"http://www.w3.org/ns/oa#",
  "annotation":"http://www.w3.org/ns/oa#Annotation",
  "datasetKey":"http://rs.gbif.org/terms/1.0/datasetKey",
  "occurrenceID":"http://rs.tdwg.org/dwc/terms/occurrenceID",
  "basisOfRecord":"http://rs.tdwg.org/dwc/terms/basisOfRecord",
  "dateIdentified":"http://rs.tdwg.org/dwc/terms/dateIdentified",
  "decimalLatitude":"http://rs.tdwg.org/dwc/terms/decimalLatitude",
  "decimalLongitude":"http://rs.tdwg.org/dwc/terms/decimalLongitude",
  "country":"http://rs.tdwg.org/dwc/terms/country",
  "countryCode":"http://rs.tdwg.org/dwc/terms/countryCode",
  "eventDate":"http://rs.tdwg.org/dwc/terms/eventDate",
  "year":"http://rs.tdwg.org/dwc/terms/year",
  "kingdom":"http://rs.tdwg.org/dwc/terms/kingdom",
  "family":"http://rs.tdwg.org/dwc/terms/family",
  "identifiedBy":"http://rs.tdwg.org/dwc/terms/identifiedBy",
  "institutionCode":"http://rs.tdwg.org/dwc/terms/institutionCode",
  "collectionCode":"http://rs.tdwg.org/dwc/terms/collectionCode",
  "catalogNumber":"http://rs.tdwg.org/dwc/terms/catalogNumber",
  "recordedBy":"http://rs.tdwg.org/dwc/terms/recordedBy",
  "scientificName":"http://rs.tdwg.org/dwc/terms/scientificName",
  "typeStatus":"http://rs.tdwg.org/dwc/terms/typeStatus",
  "recordedByID":"http://rs.tdwg.org/dwc/terms/recordedByID",
  "identifiedByID":"http://rs.tdwg.org/dwc/terms/identifiedByID"
 },
 "@type":"Person",
 "@id":"http://localhost:4567/0000-0001-7618-5230",
 "givenName":"David Peter",
 "familyName":"Shorthouse",
 "name":"David Peter Shorthouse",
 "alternateName":[
  "David P. Shorthouse",
  "David Shorthouse",
  "David Peter Shorthouse"
 ],
 "sameAs":"https://orcid.org/0000-0001-7618-5230",
 "as:prev":null,
 "as:current":"http://localhost:4567/0000-0001-7618-5230/specimens.jsonld?page=1",
 "as:next":null,
 "@reverse":{
  "identified":[
   {
    "@type":"PreservedSpecimen",
    "@id":"http://localhost:4567/occurrence/769279710",
    "sameAs":"https://gbif.org/occurrence/769279710",
    "datasetKey":"8971dfba-f762-11e1-a439-00145eb45e9a",
    "occurrenceID":"urn:catalog:UASM:UASM329573",
    "basisOfRecord":"PRESERVED_SPECIMEN",
    "dateIdentified":"2010",
    "decimalLatitude":"56.839",
    "decimalLongitude":"-118.340",
    "country":"Canada",
    "countryCode":"CA",
    "eventDate":"2004-07",
    "year":null,
    "kingdom":"Animalia",
    "family":"Linyphiidae",
    "identifiedBy":"Shorthouse, D.",
    "institutionCode":"University of Alberta Museums (UAM)",
    "collectionCode":"UASM",
    "catalogNumber":"UASM329573",
    "recordedBy":"Pinzon, J.",
    "scientificName":"Oreonetides vaginatus",
    "typeStatus":null,
    "recordedByID":null,
    "identifiedByID":null,
    "@reverse":{
     "annotation":[
      {
       "@type":"oa:Annotation",
       "@id":"BionomiaLink26380456",
       "oa:motivation":"identifying",
       "oa:target":{
        "oa:source":"https://gbif.org/occurrence/769279710",
        "oa:selector":{
         "oa:type":"TextQuoteSelector",
         "oa:exact":"Identified by"
        }
       },
       "oa:creator":{
        "@type":"Person",
        "@id":"http://localhost:4567/0000-0001-7618-5230",
        "sameAs":"https://orcid.org/0000-0001-7618-5230",
        "name":"David Peter Shorthouse"
       },
       "oa:created":"2021-01-18T12:37:30-05:00",
       "oa:modified":null
      }
     ]
    }
   },
   {
    "@type":"PreservedSpecimen",
    "@id":"http://localhost:4567/occurrence/769281222",
    "sameAs":"https://gbif.org/occurrence/769281222",
    "datasetKey":"8971dfba-f762-11e1-a439-00145eb45e9a",
    "occurrenceID":"urn:catalog:UASM:UASM329574",
    "basisOfRecord":"PRESERVED_SPECIMEN",
    "dateIdentified":"2010",
    "decimalLatitude":"56.786",
    "decimalLongitude":"-118.349",
    "country":"Canada",
    "countryCode":"CA",
    "eventDate":"2004-07",
    "year":null,
    "kingdom":"Animalia",
    "family":"Linyphiidae",
    "identifiedBy":"Shorthouse, D.",
    "institutionCode":"University of Alberta Museums (UAM)",
    "collectionCode":"UASM",
    "catalogNumber":"UASM329574",
    "recordedBy":"Pinzon, J.",
    "scientificName":"Oreonetides vaginatus",
    "typeStatus":null,
    "recordedByID":null,
    "identifiedByID":null,
    "@reverse":{
     "annotation":[
      {
       "@type":"oa:Annotation",
       "@id":"BionomiaLink42153317",
       "oa:motivation":"identifying",
       "oa:target":{
        "oa:source":"https://gbif.org/occurrence/769281222",
        "oa:selector":{
         "oa:type":"TextQuoteSelector",
         "oa:exact":"Identified by"
        }
       },
       "oa:creator":{
        "@type":"Person",
        "@id":"http://localhost:4567/0000-0001-7618-5230",
        "sameAs":"https://orcid.org/0000-0001-7618-5230",
        "name":"David Peter Shorthouse"
       },
       "oa:created":"2021-03-12T10:11:13-05:00",
       "oa:modified":null
      }
     ]
    }
   },
   {
    "@type":"PreservedSpecimen",
    "@id":"http://localhost:4567/occurrence/769279986",
    "sameAs":"https://gbif.org/occurrence/769279986",
    "datasetKey":"8971dfba-f762-11e1-a439-00145eb45e9a",
    "occurrenceID":"urn:catalog:UASM:UASM329612",
    "basisOfRecord":"PRESERVED_SPECIMEN",
    "dateIdentified":"2010",
    "decimalLatitude":"56.785",
    "decimalLongitude":"-118.355",
    "country":"Canada",
    "countryCode":"CA",
    "eventDate":"2004-07",
    "year":null,
    "kingdom":"Animalia",
    "family":"Linyphiidae",
    "identifiedBy":"Shorthouse, D.",
    "institutionCode":"University of Alberta Museums (UAM)",
    "collectionCode":"UASM",
    "catalogNumber":"UASM329612",
    "recordedBy":"Pinzon, J.",
    "scientificName":"Tunagyna debilis",
    "typeStatus":null,
    "recordedByID":null,
    "identifiedByID":null,
    "@reverse":{
     "annotation":[
      {
       "@type":"oa:Annotation",
       "@id":"BionomiaLink112282193",
       "oa:motivation":"identifying",
       "oa:target":{
        "oa:source":"https://gbif.org/occurrence/769279986",
        "oa:selector":{
         "oa:type":"TextQuoteSelector",
         "oa:exact":"Identified by"
        }
       },
       "oa:creator":{
        "@type":"Person",
        "@id":"http://localhost:4567/0000-0001-7618-5230",
        "sameAs":"https://orcid.org/0000-0001-7618-5230",
        "name":"David Peter Shorthouse"
       },
       "oa:created":"2021-09-22T08:15:12-04:00",
       "oa:modified":null
      }
     ]
    }
   }
  ],
  "recorded":[
  ]
 }
}

@dshorthouse This sort of thing does my head in :( I think the first @reverse works fine, it generates the statements you want (e.g., some occurrence dwciri:identifiedBy some person).

I'm not sure I buy the annotation, either the way it's shown here, or whether annotation is actually what you want. When I translate this into triples I get things like:

<https://json-ld.org/playground/BionomiaLink42153317> <http://www.w3.org/ns/oa#Annotation> <http://localhost:4567/occurrence/769281222> .

which misuses <http://www.w3.org/ns/oa#Annotation>: it's not a predicate, it's the class of annotations. The connection between an occurrence and an annotation as you've modelled it is oa:source, which could be used instead of annotation in the @reverse statement, but then you're in trouble because the tree-structure would require yet a further @reverse for oa:target before you get to the annotation! So I think this way lies madness, and the tree structure starts to get unmanageable.

I'm wondering why use annotations to model this? I don't think these are annotations; what you're really trying to do is give the provenance for the assertion some occurrence dwciri:identifiedBy some person. Annotations don't really express that. It feels a bit of a hack to assert dwciri:identifiedBy by annotating some text label on the GBIF page.

I think you might want to think about either (a) using nanopubs to express the provenance of the statement that someone identified something, or (b) maybe use something like schema:Role to handle provenance. Nanopubs are cool but come with some overhead. But they capture what you want. I'm not sure it would be useful trying to embed them into the JSON-LD tree above though.

The other approach is to embed provenance directly in the graph using schema:Role; see "A Scalable Approach to Incrementally Building Knowledge Graphs" (doi:10.1007/978-3-319-43997-6_15; PDF here). It's a cool idea, but it breaks everyone's expectation of just getting simple triples. I think it's probably a reasonable model for when we aggregate data across multiple sources and want to track potential conflicts.

Hope some of this makes sense.

Thanks for having a look. Indeed, it's a spectacular mess. What I'm really trying to do is say:

some person stated at timestamp that some occurrence dwciri:identifiedBy some person

Give the naked assertion some qualifier as it were. So yes, it is provenance but I do not see how/where I can express the first bit as qualifier unless I consider this some sort of annotation. This would work just fine if I stripped out the some person stated at timestamp qualifier entirely (poorly modelled as an annotation above) but then do I sacrifice trust?

I suppose another way of looking at this is:

What is the source of the assertion that some occurrence dwciri:identifiedBy some person?

This is interesting and might perhaps be what I'm looking for. But, am baffled as to how I'd represent this in JSON-LD.
[Image: Screen Shot 2021-09-29 at 8 18 41 AM]

The entity on the left evidently has a property that is a list for the many schema:name.

OK this is how I think we'd represent this:

{
	"@context": {
		"@vocab": "http://schema.org/",
		"prov": "http://www.w3.org/TR/prov-o/"
	},

	"@id": "http://thing.org/royLichtenstein",
	"@type": "Person",
	"name": [{
			"@type": "Role",
			"name": "Roy L.",
			"prov:wasGeneratedBy": [
				"http://a.source.org/",
				"http://another.source.org/"
			]
		},
		{
			"@type": "Role",
			"name": "Lichtenstein",
			"prov:wasGeneratedBy": "http://someother.source.org/"
		}

	],
	"birthDate": {
		"@type": "Role",
		"birthDate": "1923-10-27",
		"prov:wasGeneratedBy": "http://a.source.org/"
	}

}

And here it is in triples:

<http://thing.org/royLichtenstein> <http://schema.org/birthDate> _:b0 .
<http://thing.org/royLichtenstein> <http://schema.org/name> _:b1 .
<http://thing.org/royLichtenstein> <http://schema.org/name> _:b2 .
<http://thing.org/royLichtenstein> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
_:b0 <http://schema.org/birthDate> "1923-10-27" .
_:b0 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Role> .
_:b0 <http://www.w3.org/TR/prov-o/wasGeneratedBy> "http://a.source.org/" .
_:b1 <http://schema.org/name> "Roy L." .
_:b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Role> .
_:b1 <http://www.w3.org/TR/prov-o/wasGeneratedBy> "http://a.source.org/" .
_:b1 <http://www.w3.org/TR/prov-o/wasGeneratedBy> "http://another.source.org/" .
_:b2 <http://schema.org/name> "Lichtenstein" .
_:b2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Role> .
_:b2 <http://www.w3.org/TR/prov-o/wasGeneratedBy> "http://someother.source.org/" .

And as a graph:

[Image: graphviz]

In a sense this is a classic case of indirection: schema:name doesn't point to the value for the name, it points to a pointer to the value for the name. AFAIK the choice of schema:Role as the type of the intermediate node is arbitrary, but it has been used in various ways to do this sort of thing, e.g. http://blog.schema.org/2014/06/introducing-role.html. I also used this in Ozymandias to model order of authorship, but now I've switched to using RDF lists instead.

OK, I think I am getting closer. A big part of the messiness here is that the root entity is of type PreservedSpecimen, which has a DwC-based property recordedBy. That property may be interpreted as lists of other entities of type Person, whose association with recordedBy was created by yet another Person via schema:Role.

Reverted back to original JSON-LD output without the arguably messy annotation component(s).