ann-mark does not like hyphenated words
Closed this issue · 3 comments
OCR of "Stefan Zweig: Joseph Fouché"
Lyon
or Lyon,
is known to version …-9
of hunspell-de-de,
but Konvent
is known by both …-7
and …-9
of hunspell-de-de.
But the pair hyphenated across a line break, Kon-
+ vent,
is not.
jb@pers16:~/workspace/ocr-quality-check/ocrd-cor-asv-ann-mark>
► grep -A1 NOTF OCR-D-COR/OCR-D-COR_test-fouche10_5.xml
<pc:Label value="NOTFOUND" type="format"/>
<pc:Label value="{}" type="normalization"/>
--
<pc:TextEquiv conf="0.958216781616211" comments="NOTFOUND">
<pc:Unicode>Lyon,</pc:Unicode>
--
<pc:TextEquiv conf="0.926121063232422" comments="NOTFOUND">
<pc:Unicode>aufge-</pc:Unicode>
--
<pc:TextEquiv conf="0.927254638671875" comments="NOTFOUND">
<pc:Unicode>ei-</pc:Unicode>
--
<pc:TextEquiv conf="0.969852142333984" comments="NOTFOUND">
<pc:Unicode>ner</pc:Unicode>
--
<pc:TextEquiv conf="0.928169403076172" comments="NOTFOUND">
<pc:Unicode>sozia-</pc:Unicode>
--
<pc:TextEquiv conf="0.967808074951172" comments="NOTFOUND">
<pc:Unicode>le</pc:Unicode>
--
<pc:TextEquiv conf="0.904031524658203" comments="NOTFOUND">
<pc:Unicode>schattenscharf</pc:Unicode>
--
<pc:TextEquiv conf="0.924739379882813" comments="NOTFOUND">
<pc:Unicode>abge-</pc:Unicode>
--
<pc:TextEquiv conf="0.96701545715332" comments="NOTFOUND">
<pc:Unicode>stadt</pc:Unicode>
--
<pc:TextEquiv conf="0.927002487182617" comments="NOTFOUND">
<pc:Unicode>Hei-</pc:Unicode>
--
<pc:TextEquiv conf="0.965015869140625" comments="NOTFOUND">
<pc:Unicode>mat</pc:Unicode>
--
<pc:TextEquiv conf="0.856195068359375" comments="NOTFOUND">
<pc:Unicode>bürgerli-</pc:Unicode>
--
<pc:TextEquiv conf="0.966738510131836" comments="NOTFOUND">
<pc:Unicode>chen</pc:Unicode>
--
<pc:TextEquiv conf="0.925609970092773" comments="NOTFOUND">
<pc:Unicode>royalistisch</pc:Unicode>
--
<pc:TextEquiv conf="0.951864242553711" comments="NOTFOUND">
<pc:Unicode>daß</pc:Unicode>
--
<pc:TextEquiv conf="0.92312629699707" comments="NOTFOUND">
<pc:Unicode>allerblutigsten</pc:Unicode>
--
<pc:TextEquiv conf="0.918043670654297" comments="NOTFOUND">
<pc:Unicode>fanati-</pc:Unicode>
--
<pc:TextEquiv conf="0.921217956542969" comments="NOTFOUND">
<pc:Unicode>schesten</pc:Unicode>
--
<pc:TextEquiv conf="0.923025970458984" comments="NOTFOUND">
<pc:Unicode>jakobinischen</pc:Unicode>
--
<pc:TextEquiv conf="0.920231475830078" comments="NOTFOUND">
<pc:Unicode>Arbeitslo-</pc:Unicode>
--
<pc:TextEquiv conf="0.967727508544922" comments="NOTFOUND">
<pc:Unicode>sen,</pc:Unicode>
--
<pc:TextEquiv conf="0.959376907348633" comments="NOTFOUND">
<pc:Unicode>männer.</pc:Unicode>
--
<pc:TextEquiv conf="0.920437850952148" comments="NOTFOUND">
<pc:Unicode>Weltveränderer</pc:Unicode>
--
<pc:TextEquiv conf="0.921394348144531" comments="NOTFOUND">
<pc:Unicode>verbesserer</pc:Unicode>
--
<pc:TextEquiv conf="0.965120162963867" comments="NOTFOUND">
<pc:Unicode>Lyon</pc:Unicode>
--
<pc:TextEquiv conf="0.914665679931641" comments="NOTFOUND">
<pc:Unicode>Chalier,</pc:Unicode>
--
<pc:TextEquiv conf="0.928091278076172" comments="NOTFOUND">
<pc:Unicode>Revo-</pc:Unicode>
--
<pc:TextEquiv conf="0.911165771484375" comments="NOTFOUND">
<pc:Unicode>lution</pc:Unicode>
--
<pc:TextEquiv conf="0.913422241210938" comments="NOTFOUND">
<pc:Unicode>selbstaufopfern-</pc:Unicode>
--
<pc:TextEquiv conf="0.908866271972656" comments="NOTFOUND">
<pc:Unicode>hebung</pc:Unicode>
--
<pc:TextEquiv conf="0.931270523071289" comments="NOTFOUND">
<pc:Unicode>pas-</pc:Unicode>
--
<pc:TextEquiv conf="0.917864608764648" comments="NOTFOUND">
<pc:Unicode>sionierten</pc:Unicode>
--
<pc:TextEquiv conf="0.911556854248047" comments="NOTFOUND">
<pc:Unicode>Rousseaus</pc:Unicode>
--
<pc:TextEquiv conf="0.968117065429687" comments="NOTFOUND">
<pc:Unicode>brand</pc:Unicode>
--
<pc:TextEquiv conf="0.931996383666992" comments="NOTFOUND">
<pc:Unicode>un-</pc:Unicode>
--
<pc:TextEquiv conf="0.924538726806641" comments="NOTFOUND">
<pc:Unicode>Zwingburg</pc:Unicode>
--
<pc:TextEquiv conf="0.960306091308594" comments="NOTFOUND">
<pc:Unicode>Lyon</pc:Unicode>
--
<pc:TextEquiv conf="0.93150032043457" comments="NOTFOUND">
<pc:Unicode>Marat,</pc:Unicode>
--
<pc:TextEquiv conf="0.922809448242187" comments="NOTFOUND">
<pc:Unicode>blutheißen,</pc:Unicode>
--
<pc:TextEquiv conf="0.927361907958984" comments="NOTFOUND">
<pc:Unicode>damp-</pc:Unicode>
--
<pc:TextEquiv conf="0.927261657714844" comments="NOTFOUND">
<pc:Unicode>fenden</pc:Unicode>
--
<pc:TextEquiv conf="0.930834045410156" comments="NOTFOUND">
<pc:Unicode>Re-</pc:Unicode>
--
<pc:TextEquiv conf="0.932336654663086" comments="NOTFOUND">
<pc:Unicode>ent-</pc:Unicode>
--
<pc:TextEquiv conf="0.916450500488281" comments="NOTFOUND">
<pc:Unicode>kin-</pc:Unicode>
--
<pc:TextEquiv conf="0.912195739746094" comments="NOTFOUND">
<pc:Unicode>dischen</pc:Unicode>
--
<pc:TextEquiv conf="0.960453414916992" comments="NOTFOUND">
<pc:Unicode>Lyon</pc:Unicode>
--
<pc:TextEquiv conf="0.932887878417969" comments="NOTFOUND">
<pc:Unicode>Lyon,</pc:Unicode>
--
<pc:TextEquiv conf="0.693235931396484" comments="NOTFOUND">
<pc:Unicode>daf3</pc:Unicode>
--
<pc:TextEquiv conf="0.918290634155273" comments="NOTFOUND">
<pc:Unicode>tollwütig</pc:Unicode>
--
<pc:TextEquiv conf="0.929761276245117" comments="NOTFOUND">
<pc:Unicode>beses-</pc:Unicode>
--
<pc:TextEquiv conf="0.924118423461914" comments="NOTFOUND">
<pc:Unicode>sener</pc:Unicode>
--
<pc:TextEquiv conf="0.902855606079102" comments="NOTFOUND">
<pc:Unicode>lärmendsten</pc:Unicode>
--
<pc:TextEquiv conf="0.92449333190918" comments="NOTFOUND">
<pc:Unicode>jakobinischen</pc:Unicode>
--
<pc:TextEquiv conf="0.923760833740234" comments="NOTFOUND">
<pc:Unicode>stifter.</pc:Unicode>
--
<pc:TextEquiv conf="0.950532608032227" comments="NOTFOUND">
<pc:Unicode>Haß.</pc:Unicode>
--
<pc:TextEquiv conf="0.927581176757812" comments="NOTFOUND">
<pc:Unicode>be-</pc:Unicode>
--
<pc:TextEquiv conf="0.927003326416016" comments="NOTFOUND">
<pc:Unicode>Rä-</pc:Unicode>
--
<pc:TextEquiv conf="0.922306213378906" comments="NOTFOUND">
<pc:Unicode>delsführer</pc:Unicode>
--
<pc:TextEquiv conf="0.918461761474609" comments="NOTFOUND">
<pc:Unicode>neurasthenischen</pc:Unicode>
--
<pc:TextEquiv conf="0.909296798706055" comments="NOTFOUND">
<pc:Unicode>klaubt</pc:Unicode>
--
<pc:TextEquiv conf="0.921045989990234" comments="NOTFOUND">
<pc:Unicode>ver-</pc:Unicode>
--
<pc:TextEquiv conf="0.913447113037109" comments="NOTFOUND">
<pc:Unicode>ge-</pc:Unicode>
--
<pc:TextEquiv conf="0.926153106689453" comments="NOTFOUND">
<pc:Unicode>Kon-</pc:Unicode>
--
<pc:TextEquiv conf="0.922394027709961" comments="NOTFOUND">
<pc:Unicode>vent</pc:Unicode>
--
<pc:TextEquiv conf="0.963757171630859" comments="NOTFOUND">
<pc:Unicode>Lyon,</pc:Unicode>
--
<pc:TextEquiv conf="0.929217224121094" comments="NOTFOUND">
<pc:Unicode>Chalier</pc:Unicode>
But
Kon-
+vent
not.
I'm afraid I don't know enough about hunspell. It seems like you need to create a dedicated hyphenation dictionary first. If that's true, then the processor should probably offer two distinct command parameters – one for ordinary tokens, and one for those at the start/end of a line matching hyphenation rules.
Wouldn't "trying if the de-hyphenated words are found" be an option?
Wouldn't "trying if the de-hyphenated words are found" be an option?
Indeed. The processor now does dehyphenate if possible. (It also logs the mismatch rate per file and overall.)