ALL BUSINESS ENGLISH ARTICLES PYTHON

Read and extract text from a PDF file

This is the PDF file. Download “demo_ten_page_document_with_paragraphs.pdf”

This is the extracted text from the PDF file. Download “1.docx”

Here is the Python code to read and extract text from a PDF file:

import os
import tkinter as tk
from tkinter.filedialog import askopenfile

import pdfplumber
from docx import Document
from docx.shared import Inches
from PIL import Image, ImageTk

 

# Main application window; every widget below is attached to it.
root = tk.Tk()

# Empty canvas spanning the first three rows and columns. Nothing is drawn on
# it here; it only gives the grid an initial 600x200 area.
canvas = tk.Canvas(root, width=600, height=200)
canvas.grid(columnspan=3, rowspan=3)

# logo (optional: the rest of the window is built even when the file is missing)
try:
    logo = Image.open("logo.png")
except FileNotFoundError:
    print("Logo file not found. Please ensure logo.png is in the same directory as the script.")
else:
    logo = ImageTk.PhotoImage(logo)
    logo_label = tk.Label(root, image=logo)
    # Store the PhotoImage on the label as well, so the widget always holds a
    # live reference to the image it displays.
    logo_label.image = logo
    logo_label.grid(column=1, row=0)

# instructions (English / Greek) shown above the browse button
instructions = tk.Label(
    root,
    text="Select a PDF file on your computer to extract all its text / Διαλέξτε ένα αρχείο PDF από τον υπολογιστή σας για να γίνει εξαγωγή κειμένου",
    font="Calibri",
)
instructions.grid(columnspan=3, column=0, row=1)

# instructions2: tells the user where the extracted text ends up
instructions2 = tk.Label(
    root,
    text="Select 1.docx file on your computer to use all extracted text / Διαλέξτε το αρχείο 1.docx από τον υπολογιστή σας για να χρησιμοποιήσετε το εξαχθέν κείμενο",
    font="Calibri",
)
instructions2.grid(columnspan=3, column=0, row=6)

# instructions3: version / author footer
instructions3 = tk.Label(
    root,
    text="Version 1.0 | Created by Tryfon Papadopoulos",
    font="Calibri",
)
instructions3.grid(columnspan=3, column=0, row=7)

 

def open_file():
    """Let the user pick a PDF, extract its text and save the result to 1.docx.

    Button callback; takes no arguments and returns ``None``.

    Steps:
      1. Show a file dialog restricted to ``*.pdf``. Cancelling it does nothing.
      2. Extract the text of every page with pdfplumber.
      3. Write ``1.docx`` in the current working directory: one paragraph
         holding all extracted text if at least one page had text, otherwise
         one 300-dpi picture per page.
      4. Show the extracted text (or a "no text" notice) in a text box placed
         in grid cell column 1, row 3 of ``root``.

    The button label is switched to "loading…" while this runs and is always
    restored afterwards, even if extraction or saving raises.
    """
    browse_text.set("loading…")
    try:
        file = askopenfile(
            parent=root,
            mode="rb",
            title="Choose a file / Διαλέξτε ένα αρχείο",
            filetypes=[("Pdf file", "*.pdf")],
        )
        if file is None:  # dialog was cancelled
            return

        # `with file` closes the handle returned by the dialog once we are
        # done; pdfplumber reads from the already-open binary stream.
        with file, pdfplumber.open(file) as pdf:
            page_texts = []
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # Ensure the text is neither None nor empty
                    page_texts.append(text)
                    # Debug: print extracted text
                    print(f"Extracted text from page {page.page_number}: {text}")
                else:
                    # Debug: print no text message
                    print(f"No text on page {page.page_number}, saving as image.")

            # Each page's text is terminated by a newline.
            page_content = "".join(f"{text}\n" for text in page_texts)
            has_text = bool(page_texts)
            print("Final extracted content:\n", page_content)  # Debug: print all extracted content

            # Create a new Word document
            doc = Document()

            if has_text:
                doc.add_paragraph(page_content)
            else:
                # This branch is only reached when no page yielded text, so
                # every page is rendered to an image; no need to re-extract.
                for page_num, page in enumerate(pdf.pages, start=1):
                    img_path = f"page_{page_num}.png"
                    page.to_image(resolution=300).save(img_path)
                    try:
                        doc.add_picture(img_path, width=Inches(6))
                    finally:
                        # The PNG is only a temporary hand-off to python-docx;
                        # remove it even if embedding fails.
                        os.remove(img_path)

            # Save the document as "1.docx"
            doc.save("1.docx")
            print("Content saved to 1.docx")  # Debug: confirm saving to Word file

            # text box
            # NOTE(review): a new Text widget is created on every call and
            # gridded into the same cell as the previous one; the old widget
            # is never destroyed.
            text_box = tk.Text(root, height=13, width=70, padx=15, pady=15)
            if has_text:
                text_box.insert("1.0", page_content)
            else:
                text_box.insert("1.0", "No text found in PDF. Images saved to Word document.")
            text_box.tag_configure("center", justify="center")
            text_box.tag_add("center", "1.0", "end")
            text_box.grid(column=1, row=3)
    finally:
        # Restore the button label on every exit path (cancel, success, error).
        browse_text.set("Browse file / Επιλογή αρχείου")

 

# browse button
# The label lives in a StringVar so open_file() can switch it to "loading…"
# and back while it works.
browse_text = tk.StringVar()
browse_btn = tk.Button(
    root,
    textvariable=browse_text,
    command=open_file,  # pass the callback directly; no lambda wrapper needed
    font="Calibri",
    bg="#20bebe",
    fg="white",
    height=2,
    width=25,
)
browse_text.set("Browse file / Επιλογή αρχείου")
browse_btn.grid(column=1, row=2)

# Second empty canvas, gridded without an explicit row; nothing is drawn on
# it. It rebinds the module-level name `canvas`.
canvas = tk.Canvas(root, width=600, height=150)
canvas.grid(columnspan=3)

# Hand control to Tk's event loop; this call blocks until the window is closed.
root.mainloop()

 

Views: 6

Comments are closed.

Pin It