Experimentation with Python PDF file manipulation

Playing around with Python PDF handlers to compare their outputs and their performance.

For details about what each package can do, check the features folder.

Packages tested:

1. PyPDF2:

import PyPDF2

 with open("example.pdf", "rb") as file:
     pdf_reader = PyPDF2.PdfReader(file)
     # Use the PdfReader-era API: iterate .pages and call extract_text().
     # The camelCase numPages/getPage/extractText names were removed in
     # PyPDF2 3.0 and raise an error there.
     text = "".join(page.extract_text() for page in pdf_reader.pages)

2. PyMuPDF (MuPDF):

import fitz

 with fitz.open("example.pdf") as pdf_document:
     # Iterating the document yields its pages in order; leaving the
     # with-block closes the document once the text has been collected.
     text = "".join(page.get_text() for page in pdf_document)

3. Tabula-py:

import tabula

tables = tabula.read_pdf("example.pdf", pages="all")  # pages="all": read tables from every page


4. Camelot-py:

 import camelot

 tables = camelot.read_pdf("example.pdf", flavor="stream", pages="all")  # "stream" flavor, every page

5. pdfminer.six:

from pdfminer.high_level import extract_text

text = extract_text("example.pdf")  # pass the path so pdfminer opens and closes the file itself


6. PDFplumber:

 import pdfplumber

 with pdfplumber.open("example.pdf") as pdf:
     # extract_text() may yield None for a page without text in some
     # pdfplumber releases; fall back to "" so concatenation cannot fail.
     text = "".join(page.extract_text() or "" for page in pdf.pages)

7. Slate:

 from slate import PDF

 with open("example.pdf", "rb") as pdf_stream:
     # Hand the open binary stream to slate's PDF class, then ask the
     # resulting object for the document text.
     pdf = PDF(pdf_stream)
     text = pdf.text()

8. Slate (slate3k):

from slate3k import PDF

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path* as one string.

    slate3k's ``PDF`` object is a list holding one text string per page, so
    the page strings are joined directly; plain strings have no
    ``get_text()`` method to call.
    """
    with open(file_path, "rb") as file:
        pdf = PDF(file)
        return "".join(pdf)


9. PDFQuery:

 from pdfquery import PDFQuery

 def extract_text_from_pdfquery(file_path):
     """Run a PDFQuery extraction for the text of page 1 of *file_path*.

     Returns whatever ``PDFQuery.extract`` produces for the query below
     (presumably a mapping with a ``'text'`` entry -- confirm against the
     pdfquery documentation).
     """
     pdf = PDFQuery(file_path)
     # Example: Extracting text from the first page
     text = pdf.extract([
         ('with_formatter', 'text'),
         ('text', 'LTPage[pageid="1"]'),
     ])
     return text

10. PyPDFium:

 from pypdfium import PdfDocument

 def extract_text_with_pypdfium(file_path):
     """Concatenate the text of every page of the PDF at *file_path*.

     The file is opened in binary mode, wrapped in ``PdfDocument``, and each
     page index from 0 up to ``get_page_count()`` is fetched and asked for
     its text.

     NOTE(review): ``get_page_count``/``get_page``/``get_text`` are used as
     found here; confirm they match the installed pypdfium release.
     """
     with open(file_path, 'rb') as pdf_stream:
         document = PdfDocument(pdf_stream)
         page_total = document.get_page_count()
         chunks = [
             document.get_page(index).get_text()
             for index in range(page_total)
         ]
     return ''.join(chunks)