ArtifexSoftware/pdf2docx

Alignment of words is scattered when the PDF contains images

ShyamBasa opened this issue · 0 comments

Description of the bug

Word-only PDF files convert as expected, but PDF files that contain images with some words produce the issue. A sample file is attached for your reference.
test_conversion.pdf

How to reproduce the bug

from flask import Flask, request, jsonify
from pdf2docx import Converter
import os
from flask import send_file
import logging
from flask_cors import CORS

# Configure the root logger at DEBUG level for this reproduction script.
logging.basicConfig(level=logging.DEBUG)

# Flask expects the module's import name here; `__name__` is the conventional
# argument (a bare `name` is undefined and raises NameError at import time).
app = Flask(__name__)
# Apply flask_cors to the application with its default settings.
CORS(app)

@app.route('/convert_pdf_to_docx', methods=['POST'])
def convert_pdf_to_docx():
    """Convert an uploaded PDF to DOCX and return the result as a download.

    Reads the upload from the ``file`` field of the multipart request, saves
    it to a fixed temporary path, converts it with ``pdf2docx.Converter`` and
    sends the generated DOCX back as an attachment.

    Returns:
        The DOCX file response on success; a JSON ``{"error": ...}`` body with
        status 400 when no file was supplied, or status 500 when saving or
        conversion raises.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # NOTE(review): both paths are fixed, so concurrent requests would
    # overwrite each other's files -- acceptable for a single-user repro only.
    pdf_path = 'C:\\Users\\user\\Downloads\\python\\test\\temp.pdf'
    docx_path = 'C:\\Users\\user\\Downloads\\python\\test\\output.docx'

    try:
        # Save the uploaded PDF to a temporary file
        file.save(pdf_path)

        # Convert PDF to DOCX
        cv = Converter(pdf_path)
        try:
            # start=0 / end=None: presumably the whole document -- see the
            # pdf2docx Converter.convert documentation.
            cv.convert(docx_path, start=0, end=None)
        finally:
            # Always close the converter, even when convert() raises, so the
            # source PDF is released before the cleanup below removes it
            # (Windows refuses to delete a file that is still open).
            cv.close()

        # Return the DOCX file
        return send_file(docx_path, as_attachment=True)

    except Exception as e:
        # Top-level request boundary: report any failure to the client.
        return jsonify({"error": str(e)}), 500

    finally:
        # Clean up temporary PDF file
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

# Start the development server only when this file is executed as a script.
if __name__ == '__main__':
    # app.run() blocks until the server stops, so a second
    # `app.run(debug=True)` after it would only start once this server had
    # been shut down; a single call is kept.
    app.run(host='192.168.200.5', port=5000)

pdf2docx version

0.5.3

Operating system

Windows

Python version

3.10