import io
import os

from PIL import Image, UnidentifiedImageError


def extract_images(prefix, byte_data):
    """Carve embedded JPEG and PNG images out of a byte string and save them.

    The data is scanned left to right for the JPEG and PNG start signatures.
    For whichever comes first, the span up to and including the next matching
    end marker is taken as a candidate image. Candidates that Pillow cannot
    open and verify are skipped with a message; the rest are saved as
    ``./output/{prefix}_image_{n}.jpg`` or ``.png`` (``n`` starts at 1 and
    counts only the valid images).

    Args:
        prefix: Text placed at the front of every output file name.
        byte_data: The raw bytes to scan (anything supporting ``find``,
            ``len`` and slicing the way ``bytes`` does).

    Returns:
        None. Results are written to ``./output/`` and progress is printed.
    """
    # JPEG and PNG start and end markers
    jpeg_start_marker = b'\xFF\xD8'
    jpeg_end_marker = b'\xFF\xD9'
    png_start_marker = b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A'
    png_end_marker = b'IEND\xAE\x42\x60\x82'

    images = []
    start = 0

    while start < len(byte_data):
        jpeg_start = byte_data.find(jpeg_start_marker, start)
        png_start = byte_data.find(png_start_marker, start)

        # Pick whichever signature occurs first. The two signatures begin
        # with different bytes, so they can never match at the same offset.
        if jpeg_start != -1 and (png_start == -1 or jpeg_start < png_start):
            image_start, end_marker = jpeg_start, jpeg_end_marker
        elif png_start != -1:
            image_start, end_marker = png_start, png_end_marker
        else:
            break  # No more images found

        # Test the raw find() result *before* adding the marker length:
        # adding first turns the -1 sentinel into a small positive number,
        # which hides the failure and can send the scan backwards forever.
        end_marker_pos = byte_data.find(end_marker, image_start)
        if end_marker_pos == -1:
            break  # No valid end found, likely corrupted

        image_end = end_marker_pos + len(end_marker)
        image_data = byte_data[image_start:image_end]
        start = image_end  # Resume scanning after the current candidate

        try:
            # Load the candidate with PIL to confirm it is a real image.
            with Image.open(io.BytesIO(image_data)) as image:
                image.verify()
        except (OSError, SyntaxError, UnidentifiedImageError):
            # Pillow's PNG verification signals broken chunks/CRCs with
            # SyntaxError, so it is caught alongside the I/O errors.
            print(f"Skipping invalid image data at position {image_start}")
            continue

        images.append(image_data)

    if images:
        # Image.save() does not create missing parent directories.
        os.makedirs("./output", exist_ok=True)

    # Process all extracted images
    for idx, image_data in enumerate(images, start=1):
        # Reopen the image: verify() leaves the previous object unusable.
        with Image.open(io.BytesIO(image_data)) as image:
            extension = 'jpg' if image.format == 'JPEG' else 'png'
            image.save(f"./output/{prefix}_image_{idx}.{extension}")
            print(f"Extracted {image.format} image {idx}")


# Run the extraction over every regular file found in ./input/.
files = os.listdir("./input/")
for file in files:
    input_path = os.path.join("./input/", file)
    if not os.path.isfile(input_path):
        continue  # Sub-directories cannot be opened for reading

    filename, _ = os.path.splitext(file)
    # Use a separate name for the handle so the loop variable is not rebound.
    with open(input_path, "rb") as input_file:
        extract_images(filename, input_file.read())