-
Notifications
You must be signed in to change notification settings - Fork 678
Closed
Labels
not a bugnot a bug / user error / unable to reproducenot a bug / user error / unable to reproduce
Description
Hi Jorj, PyMuPDF is an excellent tool!
I am trying to extract images from scientific literature PDFs, but some of the images are not color-mapped correctly. Do you have any ideas or possible solutions? Thank you very much for your help in advance!
Here is the original code:
import fitz # PyMuPDF
import os
def extract_images_from_pdf(pdf_path, output_folder):
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
doc = fitz.open(pdf_path)
for i in range(len(doc)):
page = doc.load_page(i)
image_list = page.get_images(full=True)
for image_index, img in enumerate(image_list):
xref = img[0]
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_filename = f"{pdf_name}_page_{i+1}_img_{image_index+1}.png"
image_output_path = os.path.join(output_folder, image_filename)
with open(image_output_path, "wb") as f:
f.write(image_bytes)
print(f"Image saved to {image_output_path}")
doc.close()
def extract_images_from_folder(pdf_folder, output_folder):
if not os.path.exists(output_folder):
os.makedirs(output_folder)
for filename in os.listdir(pdf_folder):
if filename.lower().endswith(".pdf"):
pdf_path = os.path.join(pdf_folder, filename)
extract_images_from_pdf(pdf_path, output_folder)
# Example usage
pdf_folder = "./pdf" # Folder containing PDFs
output_folder = "./output" # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder)Metadata
Metadata
Assignees
Labels
not a bugnot a bug / user error / unable to reproducenot a bug / user error / unable to reproduce

