pdf.load() in pdfquery.py - 'dict' object has no attribute 'resolve'
dannguyen opened this issue · 2 comments
dannguyen commented
Using Python version 3.5 (w/ Anaconda 2.4.0); sorry I don't have much more to add than a bug report. I've just been looking for something in Python 3.x to convert a PDF into text while preserving its layout (a la pdftotext from poppler)... so pdfquery is probably beyond my plaintext needs. But I figured you'd be interested in knowing.
Reproducible code:
curl \
https://static.googleusercontent.com/media/www.google.com/en//selfdrivingcar/files/reports/report-0515.pdf \
-o g.pdf
import pdfquery
pdf = pdfquery.PDFQuery("g.pdf")
pdf.load()
AttributeError Traceback (most recent call last)
<ipython-input-3-4357470f507b> in <module>()
----> 1 pdf.load()
/Users/dtown/.pyenv/versions/anaconda3-2.4.0/lib/python3.5/site-packages/pdfquery/pdfquery.py in load(self, *page_numbers)
381 [<LTPage>, <LTPage>]
382 """
--> 383 self.tree = self.get_tree(*_flatten(page_numbers))
384 self.pq = self.get_pyquery(self.tree)
385
/Users/dtown/.pyenv/versions/anaconda3-2.4.0/lib/python3.5/site-packages/pdfquery/pdfquery.py in get_tree(self, *page_numbers)
483 else:
484 pages = enumerate(self.get_layouts())
--> 485 for n, page in pages:
486 page = self._xmlize(page)
487 page.set('page_index', obj_to_string(n))
/Users/dtown/.pyenv/versions/anaconda3-2.4.0/lib/python3.5/site-packages/pdfquery/pdfquery.py in <genexpr>(.0)
604 def get_layouts(self):
605 """ Get list of PDFMiner Layout objects for each page. """
--> 606 return (self.get_layout(page) for page in self._cached_pages())
607
608 def _cached_pages(self, target_page=-1):
/Users/dtown/.pyenv/versions/anaconda3-2.4.0/lib/python3.5/site-packages/pdfquery/pdfquery.py in get_layout(self, page)
599 self.interpreter.process_page(page)
600 layout = self.device.get_result()
--> 601 layout = self._add_annots(layout, page.annots)
602 return layout
603
/Users/dtown/.pyenv/versions/anaconda3-2.4.0/lib/python3.5/site-packages/pdfquery/pdfquery.py in _add_annots(self, layout, annots)
642 annots = annots.resolve()
643 for annot in annots:
--> 644 annot = annot.resolve()
645 if annot.get('Rect') is not None:
646 annot['bbox'] = annot.pop('Rect') # Rename key
AttributeError: 'dict' object has no attribute 'resolve'
rawrxiv commented
How's it going? I'm getting the same issue but am not able to provide a PDF source:
Traceback (most recent call last):
File "<ipython-input-3-ec479cbcfae6>", line 1, in <module>
runfile('D:/Tools/Scripts/AOWScraper.py', wdir='D:/Tools/Scripts')
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 807, in runfile
execfile(filename, namespace)
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 95, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "D:/Tools/Scripts/AOWScraper.py", line 11, in <module>
pdf.load()
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\pdfquery\pdfquery.py", line 383, in load
self.tree = self.get_tree(*_flatten(page_numbers))
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\pdfquery\pdfquery.py", line 485, in get_tree
for n, page in pages:
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\pdfquery\pdfquery.py", line 606, in <genexpr>
return (self.get_layout(page) for page in self._cached_pages())
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\pdfquery\pdfquery.py", line 601, in get_layout
layout = self._add_annots(layout, page.annots)
File "D:\Utilities\WinPython-64bit-3.4.3.7\python-3.4.3.amd64\lib\site-packages\pdfquery\pdfquery.py", line 644, in _add_annots
annot = annot.resolve()
AttributeError: 'dict' object has no attribute 'resolve'
jcushman commented
This is fixed in v.0.4.2 on PyPI. Thanks!