Skip to content

False color imaging #2953

@YikunHan42

Description

@YikunHan42

Hi Jorj, PyMuPDF is an excellent tool!

I am trying to extract images from scientific literature PDFs, but some of the images are not color-mapped correctly. Do you have any ideas or possible solutions? Thank you very much for your help in advance!

Example with correct colors:
10 1002+adma 202105410_page_4_img_1

Example with wrong colors:
10 1002+adma 202105410_page_3_img_1

Here is the original code:

import fitz  # PyMuPDF
import os

def extract_images_from_pdf(pdf_path, output_folder):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    doc = fitz.open(pdf_path)

    for i in range(len(doc)):
        page = doc.load_page(i)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            image_filename = f"{pdf_name}_page_{i+1}_img_{image_index+1}.png"
            image_output_path = os.path.join(output_folder, image_filename)

            with open(image_output_path, "wb") as f:
                f.write(image_bytes)

            print(f"Image saved to {image_output_path}")

    doc.close()

def extract_images_from_folder(pdf_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            extract_images_from_pdf(pdf_path, output_folder)

# Example usage
pdf_folder = "./pdf"  # Folder containing PDFs
output_folder = "./output"  # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder)

Metadata

Metadata

Assignees

No one assigned

    Labels

    not a bugnot a bug / user error / unable to reproduce

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions