"""Render pages of PDF files to JPEG images and record which pages were used.

Usage: ``script.py INPUT OUTPUT [--count N]``. Images are written to
``OUTPUT/images`` and a summary is written to ``OUTPUT/files_info.json``.
"""

import argparse
import json
import os
import random
import shutil
import uuid

import fitz

# One {"file_name": ..., "page_numbers": [...]} record per processed PDF.
# Filled by pyMuPDF_fitz() and dumped to files_info.json by main().
json_data = []

# Extensions (lower-case, leading dot) that process_files() treats as PDFs.
_PDF_EXTENSIONS = (".pdf",)

# Zoom factors used when rasterising a page. The values are kept exactly as
# they were originally written.
# NOTE(review): both look like approximations of 4/3 -- confirm intent.
_ZOOM_X = 1.33333
_ZOOM_Y = 1.333333


def is_in_whitelist(file_name, whitelist):
    """Return True if the extension of *file_name* is listed in *whitelist*.

    The extension is lower-cased before the lookup, so *whitelist* entries
    are expected to be lower-case and to include the leading dot.
    """
    _, ext = os.path.splitext(file_name)
    return ext.lower() in whitelist


def generate_unique_random_numbers(start, end, count):
    """Return up to *count* distinct random integers from ``[start, end)``.

    *count* is clamped to the size of the range, so a request for more
    numbers than the range contains returns the whole range instead of
    looping forever. A non-positive *count* or an empty range yields ``[]``.
    The result is sorted in ascending order.
    """
    population = range(start, end)
    wanted = min(count, len(population))
    if wanted <= 0:
        return []
    return sorted(random.sample(population, wanted))


def get_render_pages(count, need):
    """Choose the page indices to render from a document with *count* pages.

    Documents with fewer than three pages always return every page,
    regardless of *need*; otherwise *need* distinct random indices in
    ``[0, count)`` are returned.
    """
    if count < 3:
        return list(range(count))
    return generate_unique_random_numbers(0, count, need)


def pyMuPDF_fitz(pdfPath, imagePath, need):
    """Render pages of one PDF to JPEG files and record them in json_data.

    Args:
        pdfPath: Path of the PDF file to open.
        imagePath: Directory the images are saved to; created on demand.
        need: Number of pages to render. ``0``, or any value not smaller
            than the page count, renders every page.

    Errors are reported on stdout rather than raised: a page that fails to
    render is skipped, and any other failure aborts the current document.
    """
    pdfDoc = None
    try:
        pdfDoc = fitz.open(pdfPath)
        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
        page_total = pdfDoc.page_count

        if need == 0 or need >= page_total:
            page_array = list(range(page_total))
        else:
            page_array = get_render_pages(page_total, need)

        # Recorded before rendering, so the entry lists the selected pages
        # even if some of them later fail to render.
        json_data.append({
            "file_name": file_name,
            "page_numbers": page_array,
        })

        # The matrix does not depend on the page, so build it once.
        mat = fitz.Matrix(_ZOOM_X, _ZOOM_Y)

        for pg in page_array:
            page = pdfDoc[pg]
            try:
                pix = page.get_pixmap(matrix=mat, alpha=False)
            except Exception as e:
                print("pdfPath=" + pdfPath + " ---------------- ",
                      e.__class__.__name__, e)
                continue

            os.makedirs(imagePath, exist_ok=True)
            # A short uuid fragment keeps equally named PDFs from different
            # folders from trivially overwriting each other's images.
            image_name = f"{file_name}-{str(uuid.uuid1())[:4]}-{pg}.jpg"
            pix.save(os.path.join(imagePath, image_name))
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        # fitz.open() may have failed, in which case there is nothing to close.
        if pdfDoc is not None:
            pdfDoc.close()


def process_files(input, output, need):
    """Walk *input* recursively and render every non-hidden PDF found.

    Args:
        input: Root directory to scan.
        output: Directory that receives the rendered images.
        need: Number of pages to render per PDF (see pyMuPDF_fitz).
    """
    for root, _dirs, files in os.walk(input):
        for file in files:
            # Skip dot-files and anything that is not a PDF.
            if file.startswith(".") or not is_in_whitelist(file, _PDF_EXTENSIONS):
                continue
            pyMuPDF_fitz(os.path.join(root, file), output, need)


def main():
    """Parse the command line, render the PDFs and write files_info.json."""
    # The text is passed as `description`; the first positional parameter of
    # ArgumentParser is `prog`, which would replace the program name in the
    # usage line.
    parse = argparse.ArgumentParser(description="从PDF当中随机抽取图片")
    parse.add_argument("input", help="输入路径")
    parse.add_argument("output", help="输出路径")
    parse.add_argument("--count", type=int, default=0,
                       help="每份PDF需要提取的图片数量")
    args = parse.parse_args()

    process_files(args.input, os.path.join(args.output, "images"), args.count)

    # The output directory is otherwise only created as a side effect of
    # saving an image, so make sure it exists even when nothing was rendered.
    os.makedirs(args.output, exist_ok=True)
    json_path = os.path.join(args.output, "files_info.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()