dbmdz/solr-ocrhighlighting

Solr returns error with empty ALTO, by design?

giancarlobi opened this issue · 2 comments

@jbaiter I installed the new 0.6.0 release and I'm running some checks with ALTO. I generate the ALTO from PDF with pdfalto. For a blank PDF page I get an empty ALTO file like this:

<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xsi:schemaLocation="http://www.loc.gov/standards/alto/v3/alto.xsd">
<Description>
<MeasurementUnit>pixel</MeasurementUnit>
<sourceImageInformation><fileName>23-n1_343578.pdf</fileName></sourceImageInformation>
<OCRProcessing ID="IdOcr"><ocrProcessingStep><processingDateTime>2021-05-13T09:26:19Z</processingDateTime><processingSoftware><softwareCreator>CONTRIBUTORS</softwareCreator><softwareName>pdfalto</softwareName><softwareVersion>0.5</softwareVersion></processingSoftware></ocrProcessingStep></OCRProcessing>
</Description>
<Styles/>
<Layout><Page ID="Page6" PHYSICAL_IMG_NR="6" WIDTH="595.276" HEIGHT="841.890"><PrintSpace/></Page></Layout>
</alto>

That is, the page contains no TextBlock/TextLine/String elements at all.
When I index this document, Solr gives the following error:

2021-05-13 09:19:21.360 ERROR (qtp1997859171-641) [   x:archipelago] o.a.s.h.RequestHandlerBase org.apache.solr.common.SolrException: Exception writing document id g0qa5y-default_solr_index-strawberryfield_flavor_datasource/151:6:en:cc9b80b1-bd9a-4657-94df-4d62785e08c5:ocr to the index; possible analysis error.
        at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:256)
        at org.apache.solr.update.processor.RunUpdateProcessorFactory$RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:73)
        at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:55)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:259)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.doVersionAdd(DistributedUpdateProcessor.java:498)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.lambda$versionAdd$0(DistributedUpdateProcessor.java:339)
        at org.apache.solr.update.VersionBucket.runWithLock(VersionBucket.java:50)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:339)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:225)
        at org.apache.solr.update.processor.LogUpdateProcessorFactory$LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:106)
        at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:263)
        at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:190)
        at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:97)
        at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:68)
        at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:214)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:2627)
        at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:795)
        at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:568)
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:415)
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
        at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1596)
        at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:545)
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
        at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:590)
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
        at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
        at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)
        at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
        at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1300)
        at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
        at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:485)
        at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)
        at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
        at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1215)
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
        at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
        at org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177)
        at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
        at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322)
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
        at org.eclipse.jetty.server.Server.handle(Server.java:500)
        at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)
        at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:547)
        at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)
        at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)
        at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
        at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
        at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
        at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375)
        at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)
        at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Current event not START_ELEMENT
        at com.ctc.wstx.sr.BasicStreamReader.getAttributeValue(BasicStreamReader.java:632)
        at de.digitalcollections.solrocr.formats.alto.AltoParser.readNext(AltoParser.java:59)
        at de.digitalcollections.solrocr.formats.OcrParser.<init>(OcrParser.java:106)
        at de.digitalcollections.solrocr.formats.alto.AltoParser.<init>(AltoParser.java:27)
        at de.digitalcollections.solrocr.formats.alto.AltoFormat.getParser(AltoFormat.java:38)
        at de.digitalcollections.solrocr.model.OcrFormat.filter(OcrFormat.java:90)
        at de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory.create(OcrCharFilterFactory.java:51)
        at org.apache.solr.analysis.TokenizerChain.initReader(TokenizerChain.java:97)
        at org.apache.lucene.analysis.AnalyzerWrapper.initReader(AnalyzerWrapper.java:156)
        at org.apache.lucene.analysis.AnalyzerWrapper.initReader(AnalyzerWrapper.java:156)
        at org.apache.lucene.analysis.Analyzer.tokenStream(Analyzer.java:197)
        at org.apache.lucene.document.Field.tokenStream(Field.java:513)
        at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:915)
        at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:524)
        at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:488)
        at org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments(DocumentsWriterPerThread.java:208)
        at org.apache.lucene.index.DocumentsWriter.updateDocuments(DocumentsWriter.java:419)
        at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1471)
        at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757)
        at org.apache.solr.update.DirectUpdateHandler2.updateDocOrDocValues(DirectUpdateHandler2.java:983)
        at org.apache.solr.update.DirectUpdateHandler2.doNormalUpdate(DirectUpdateHandler2.java:347)
        at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:294)
        at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:241)
        ... 56 more

Is this by design, meaning I have to add at least one empty (or whitespace-only) String element?
Thanks for everything.

That's a bug, thank you for catching it :-) 🐞

Thank you for your quick response. Have a nice day!