Download TEI document not working

Question

Download TEI document not working

Closed this issue 2 years ago · 5 comments

I created an item, Bates34, and uploaded 3 images to it. I checked that they processed, interacted with them a bit and then hit "Download the TEI document" and got back the following document.

That is, the JSON-LD metadata is injected, but the document body is empty. There were no errors on the backend.

<?xml version="1.0" encoding="UTF-8"?><TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:c="http://www.w3.org/ns/xproc-step" xmlns:fn="http://www.w3.org/2005/xpath-functions" xmlns:z="https://github.com/Conal-Tuohy/XProc-Z" xml:id="Bates34"><teiHeader><fileDesc><titleStmt><title>My Research Object Crate</title></titleStmt><publicationStmt><publisher><ref target="https://nyingarn.net/">Nyingarn</ref></publisher></publicationStmt><sourceDesc><p/></sourceDesc></fileDesc><xenoData type="application/ld+json">{
  "@graph": [
    {
      "identifier": "ro-crate-metadata.json",
      "@type": "CreativeWork",
      "about": { "@id": ".\/" },
      "@id": "ro-crate-metadata.json",
      "conformsTo": { "@id": "https:\/\/w3id.org\/ro\/crate\/1.1" }
    },
    {
      "hasPart": [
        { "@id": "Bates34-005.jpg" },
        { "@id": "Bates34-005.thumbnail_h300.jpg" },
        { "@id": "Bates34-005.webp" },
        { "@id": "Bates34-005.tei.xml" },
        { "@id": "Bates34-005.textract_ocr-ADMIN.json" },
        { "@id": "Bates34-004.jpg" },
        { "@id": "Bates34-004.thumbnail_h300.jpg" },
        { "@id": "Bates34-004.webp" },
        { "@id": "Bates34-004.tei.xml" },
        { "@id": "Bates34-004.textract_ocr-ADMIN.json" },
        { "@id": "Bates34-006.jpg" },
        { "@id": "Bates34-006.thumbnail_h300.jpg" },
        { "@id": "Bates34-006.webp" },
        { "@id": "Bates34-006.tei.xml" },
        { "@id": "Bates34-006.textract_ocr-ADMIN.json" }
      ],
      "@type": [ "Dataset" ],
      "name": "My Research Object Crate",
      "@id": ".\/"
    },
    {
      "@type": "File",
      "contentSize": 968428,
      "name": "Bates34-005.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:13:55.000Z",
      "@id": "Bates34-005.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 12539,
      "name": "Bates34-005.thumbnail_h300.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:13:54.000Z",
      "@id": "Bates34-005.thumbnail_h300.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 764292,
      "name": "Bates34-005.webp",
      "encodingFormat": "image\/webp",
      "dateModified": "2023-08-07T01:13:57.000Z",
      "@id": "Bates34-005.webp",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 2751,
      "name": "Bates34-005.tei.xml",
      "encodingFormat": "application\/xml",
      "dateModified": "2023-08-07T01:14:00.000Z",
      "@id": "Bates34-005.tei.xml",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 193406,
      "name": "Bates34-005.textract_ocr-ADMIN.json",
      "encodingFormat": "application\/json",
      "dateModified": "2023-08-07T01:14:00.000Z",
      "@id": "Bates34-005.textract_ocr-ADMIN.json",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 1061697,
      "name": "Bates34-004.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:14:02.000Z",
      "@id": "Bates34-004.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 11240,
      "name": "Bates34-004.thumbnail_h300.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:14:02.000Z",
      "@id": "Bates34-004.thumbnail_h300.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 874920,
      "name": "Bates34-004.webp",
      "encodingFormat": "image\/webp",
      "dateModified": "2023-08-07T01:14:05.000Z",
      "@id": "Bates34-004.webp",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 3572,
      "name": "Bates34-004.tei.xml",
      "encodingFormat": "application\/xml",
      "dateModified": "2023-08-07T01:14:09.000Z",
      "@id": "Bates34-004.tei.xml",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 224546,
      "name": "Bates34-004.textract_ocr-ADMIN.json",
      "encodingFormat": "application\/json",
      "dateModified": "2023-08-07T01:14:09.000Z",
      "@id": "Bates34-004.textract_ocr-ADMIN.json",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 980112,
      "name": "Bates34-006.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:14:11.000Z",
      "@id": "Bates34-006.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 13008,
      "name": "Bates34-006.thumbnail_h300.jpg",
      "encodingFormat": "image\/jpeg",
      "dateModified": "2023-08-07T01:14:11.000Z",
      "@id": "Bates34-006.thumbnail_h300.jpg",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 783250,
      "name": "Bates34-006.webp",
      "encodingFormat": "image\/webp",
      "dateModified": "2023-08-07T01:14:14.000Z",
      "@id": "Bates34-006.webp",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 2959,
      "name": "Bates34-006.tei.xml",
      "encodingFormat": "application\/xml",
      "dateModified": "2023-08-07T01:14:17.000Z",
      "@id": "Bates34-006.tei.xml",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    },
    {
      "@type": "File",
      "contentSize": 217646,
      "name": "Bates34-006.textract_ocr-ADMIN.json",
      "encodingFormat": "application\/json",
      "dateModified": "2023-08-07T01:14:18.000Z",
      "@id": "Bates34-006.textract_ocr-ADMIN.json",
      "@reverse": { "hasPart": [ { "@id": ".\/" } ] }
    }
  ],
  "@context": [
    "https:\/\/w3id.org\/ro\/crate\/1.1\/context",
    { "@vocab": "http:\/\/schema.org\/" },
    { "txc": "https:\/\/purl.archive.org\/textcommons\/terms#" },
    { "@base": null }
  ]
}</xenoData></teiHeader><text><body/></text></TEI>

Answer 1 · 2023-08-07T01:27:53.000Z

Thanks! I kind of expected it not to work 100% but that's not quite as close as I'd hoped. I realise now that the test case for reassembly of TEI documents is pretty weak (obviously because it gave this a passing mark).

Answer 2 · 2023-08-07T06:06:26.000Z

I tried downloading the Kalsakau TEI and it gives an error message that one of the pages may be invalid. There are 49 pages, so it would be preferable to be told which page is invalid so it can be fixed. I then went through each page and saved it, and none threw an error, suggesting they are all valid TEI.

Answer 3 · 2023-08-23T23:32:00.000Z

I wasn't able to replicate this particular issue in my dev environment, so although it does work for me, I'm not going to close the issue just now.

In my dev environment I've been able to create a new item, upload a TEI file into it, edit the resulting pages in Nyingarn, download the file again, edit it offline, re-upload the edited file (after deleting pages already in Nyingarn) and see my edited pages appear in the Nyingarn workspace.

I've updated the "assemble-tei" test to include a complete round-trip scenario, after which it compares the originally uploaded document with the final download, using a schematron schema to validate that they have the same set of div elements as well as the same number of pages with the same identifiers and the same textual content (disregarding whitespace, which the ingester does tidy up a bit). The new test is passing.

See #197

Answer 4 · 2023-08-25T07:20:14.000Z

I've worked out what the issue is: the document won't export as a single TEI file because it was originally produced by OCR.

The reconstitute.xsl stylesheet selects all the surface elements whose @xml:id attribute starts with the document's identifier (Bates34, in this case) and which (with the addition of a suffix of .xml) match the regular expression for page identifiers. The identifiers in the document above didn't match that regex, and so none of them were selected for inclusion in the output document, which consequently had an empty body element.

The XSLT-based ingestion process does produce page identifiers that match that regex, but the OCR-based ingestion obviously doesn't (or at least doesn't necessarily).

The reconstitute.xsl stylesheet could dispense with that check and just stick together all the surfaces which are supplied, whatever their identifiers might be. When I remove that filter from the reconstitute.xsl stylesheet, the export does work, but the downloaded file has page break identifiers which don't match the convention and hence produces an error if you try to re-ingest it:

When reporting this problem to us please include the following information in the message (and send us the file that caused the error):

item: Bates34
file: Bates34-tei.xml
error: The document did not contain any pages whose identifiers started with 'Bates34' and matched the regular expression '^[a-z,A-Z][a-z,A-Z,0-9,_]+-[0-9]+[a-z,A-Z,0-9]+..*$'.

Firstly, it seems to me that if we are refusing to ingest a TEI file with a particular set of page identifiers, then we shouldn't allow those same identifiers to have got into Nyingarn through some other route, so I think the OCR ingestion process should be tweaked so that it does generate identifiers that match our expectations (which is, in short, that the xml:id value should be equal to the name of the TEI surface file, without the .tei.xml extension).

Secondly there's the question of how to deal with documents already in storage which don't match the convention.

It seems to me that the reconstitute.xsl stylesheet can actually obtain a good identifier from the name of the surface file itself, and just ignore the xml:id value. That would mean that existing OCR'd documents could be round-tripped, because their page identifiers would be fixed up on export from Nyingarn.

The XML below is a snippet of what the stylesheet is passed when the surface files are uploaded to the XML web service. Note the mismatch between the surface/@xml:id value and the filename specified in the c:body/@disposition attribute which comes from the HTTP multi-part upload, and which naturally does consist of a good identifier, which just needs the .tei.xml extension stripped off it.

<c:body content-type="application/xml" 
	disposition="form-data; name=&#34;source&#34;; filename=&#34;Bates34-001aT.tei.xml&#34;">
	<surface xmlns="http://www.tei-c.org/ns/1.0" xml:id="Bates34-Bates34-001aT.jpg">
		<line>Daisy Bates Papers</line>
		<line>MS 365</line>
		<line>Section XI</line>
		<line>DANCES, SONGS</line>
	</surface>
</c:body>
<c:body content-type="application/xml" 
	disposition="form-data; name=&#34;source&#34;; filename=&#34;Bates34-008.tei.xml&#34;">
	<surface xmlns="http://www.tei-c.org/ns/1.0" xml:id="Bates34-Bates34-008.jpg">
		<line>34/8</line>
                ...
		<line>for any dance, nor are they permitted to touch any ornaments made</line>
	</surface>
</c:body>

Answer 5 · 2023-09-04T00:36:27.000Z

Thanks @Conal-Tuohy

I'll fix the bug in the code and run a script over the repo to fix broken OCR files as well. The pull request is about to be merged in as well.