VikParuchuri/marker

Feature request: URL extraction

ShakirAkbari opened this issue · 1 comment

Requesting an additional feature when extracting information from PDFs.

Can you please add the ability to extract URLs from the document?

I wrote this code to pull link text and links out of pdfs. Maybe you can incorporate part of this into your code base with an option to enable extract_links_from_pdf in the settings:

from gc import get_objects
from pypdf import PdfReader
from pypdf.annotations import Link
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal

def parse_pdf(filename):
    """Print the extracted text of every page of the PDF at *filename*.

    Pages are printed in order, each preceded by a blank line.
    """
    reader = PdfReader(filename)
    # Iterate the page collection directly; a manual while/index loop
    # is error-prone and unidiomatic.
    for page in reader.pages:
        print(f'\n{page.extract_text()}')

def extract_links_from_pdf(pdf_path):
    """Return merged (url, link_text) pairs for every URI link in the PDF.

    For each page, collects link annotations ('/Subtype' == '/Link') that
    carry a '/A' action with a '/URI' and a '/Rect' bounding box, then
    matches each rectangle against per-character positions from the same
    page so every URL is paired with its visible anchor text.

    Returns the result of merge_adjacent_links: a list of (url, text)
    tuples with consecutive fragments of the same URL joined.
    """
    links = []

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_num, page in enumerate(pdf_reader.pages):
            page_links = []
            if '/Annots' in page:
                for annotation in page['/Annots']:
                    annotation_object = annotation.get_object()

                    # Guard clauses: only URI link annotations that also
                    # carry a bounding box are usable.
                    if annotation_object['/Subtype'] != '/Link':
                        continue
                    if '/A' not in annotation_object:
                        continue
                    action = annotation_object['/A']
                    if '/URI' not in action:
                        continue
                    if '/Rect' not in annotation_object:
                        continue

                    uri = action['/URI']
                    rect = annotation_object['/Rect']
                    x1, y1, x2, y2 = [float(coord) for coord in rect]
                    page_links.append((uri, x1, y1, x2, y2))

            # Character extraction is comparatively slow; only do it for
            # pages that actually contain links.
            if page_links:
                page_text = extract_text_with_positions(pdf_path, page_num)
                links.extend(associate_text_with_links(page_text, page_links))

    return merge_adjacent_links(links)

def extract_text_with_positions(pdf_path, page_num):
    """Return (char, x0, y1) tuples for every character on one PDF page.

    Uses pdfminer layout analysis on page *page_num* of *pdf_path*.
    For each LTChar inside a horizontal text line, records the character
    text together with its left edge (x0) and top edge (y1).
    """
    text_with_positions = []
    for page_layout in extract_pages(pdf_path, page_numbers=[page_num]):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    if isinstance(text_line, LTTextLineHorizontal):
                        for character in text_line:
                            if isinstance(character, LTChar):
                                text_with_positions.append(
                                    (character.get_text(), character.x0, character.y1)
                                )
    return text_with_positions

def associate_text_with_links(page_text, page_links):
    """Pair each link rectangle with the characters that fall inside it.

    Args:
        page_text: list of (char, x, y) character positions.
        page_links: list of (uri, x1, y1, x2, y2) link rectangles.

    Returns:
        A list of (uri, link_text, y1) tuples, one per input link, where
        link_text is the concatenation (in page_text order) of every
        character whose position lies within the link's rectangle.
    """
    associated_links = []
    for uri, x1, y1, x2, y2 in page_links:
        # Keep characters whose anchor point lies inside the rectangle.
        link_text = ''.join(
            char
            for char, char_x, char_y in page_text
            if x1 <= char_x <= x2 and y1 <= char_y <= y2
        )
        associated_links.append((uri, link_text, y1))
    return associated_links

def merge_adjacent_links(links):
    """Merge consecutive fragments that point at the same URL.

    Args:
        links: list of (url, text, y) tuples, where y is the fragment's
            vertical position on the page.

    Returns:
        A list of (url, text) tuples. Fragments are sorted top-to-bottom
        (descending y, URL as tie-breaker); runs of adjacent fragments
        sharing a URL are joined with single spaces.
    """
    merged_links = []
    current_url = None
    current_text = ""

    # Sort top-to-bottom, then by URL, so fragments of the same link
    # end up adjacent and can be merged in a single pass.
    for url, text, _ in sorted(links, key=lambda x: (-x[2], x[0])):
        if url == current_url:
            current_text += " " + text
        else:
            if current_url:
                merged_links.append((current_url, current_text.strip()))
            current_url = url
            current_text = text

    # Flush the final accumulated link, if any.
    if current_url:
        merged_links.append((current_url, current_text.strip()))

    return merged_links

# Script entry point. The original paste read `if name == 'main':` —
# markdown stripped the dunder underscores; restore the standard guard.
if __name__ == '__main__':
    pdf_path = 'data/sample.pdf'
    parse_pdf(pdf_path)

    print("\n---")
    extracted_links = extract_links_from_pdf(pdf_path)
    for link, text in extracted_links:
        print(f"Link Text: {text}")
        print(f"Link: {link}")
        print("---")