"""Render every page of one PDF, or of all PDFs under a directory, to image files."""
import fitz  # fitz is provided by PyMuPDF (pip install PyMuPDF)
import uuid
import os
import argparse
from tqdm import tqdm

# Running total of page images written by pyMuPDF_fitz().  Defined at module
# level so the function also works when this file is imported, not only when
# it is run as a script.
count = 0

# Scale factors applied to each page when rendering.  The values (including
# their differing precision) are kept exactly as they were so rendered pixel
# dimensions do not change.  NOTE(review): roughly 4/3 -- presumably intended
# to scale PyMuPDF's default resolution up by a third; confirm.
_ZOOM_X = 1.33333
_ZOOM_Y = 1.333333


def findfiles(path):
    """Return the paths of all PDF files under *path*, searched recursively.

    Args:
        path: Directory to search.

    Returns:
        A list of file paths (each built with ``os.path.join`` from *path*)
        whose name ends in ``.pdf``, compared case-insensitively.  Entries of
        each directory are visited in sorted order.

    Raises:
        OSError: If *path* (or a subdirectory) cannot be listed.
    """
    result = []
    for entry in sorted(os.listdir(path)):
        # Join with the parent so the recursion and the returned paths are
        # usable regardless of the current working directory.
        cur_path = os.path.join(path, entry)
        if os.path.isdir(cur_path):
            # Collect the matches from the subdirectory as well.
            result.extend(findfiles(cur_path))
        elif entry.lower().endswith('.pdf'):
            result.append(cur_path)
    return result


def pyMuPDF_fitz(pdfPath, imagePath):
    """Render each page of the PDF at *pdfPath* to an image in *imagePath*.

    Output files are named ``<pdf stem>-<4 uuid chars>-<n>.jpg`` where ``n``
    counts the pages successfully written from this document, starting at 1.
    Every image written also increments the module-level ``count``.

    Errors are reported with ``print`` rather than raised: a page that fails
    to render is skipped, and any other failure stops processing of this
    document.  The document is always closed once it has been opened.

    Args:
        pdfPath: Path of the PDF file to convert.
        imagePath: Directory that receives the images; created if missing.
    """
    global count

    try:
        pdfDoc = fitz.open(pdfPath)
    except Exception as e:
        # Nothing was opened, so there is nothing to close.
        print(f"发生错误: {e}")
        return

    try:
        os.makedirs(imagePath, exist_ok=True)
        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
        # The transform is identical for every page, so build it once.
        mat = fitz.Matrix(_ZOOM_X, _ZOOM_Y).prerotate(0)

        cnt = 1
        for pg in range(pdfDoc.page_count):
            page = pdfDoc[pg]
            try:
                pix = page.get_pixmap(matrix=mat, alpha=False)
            except Exception as e:
                # Skip only the page that failed and keep going.
                print("pdfPath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
                continue

            # The short uuid fragment keeps same-named PDFs from different
            # folders from overwriting each other's output.
            save_path = os.path.join(
                imagePath,
                file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(cnt) + '.jpg',
            )
            pix.save(save_path)
            count += 1
            cnt += 1
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        pdfDoc.close()


def main():
    """Parse the command line and convert the requested PDF(s)."""
    global count

    parser = argparse.ArgumentParser()
    parser.add_argument('--pdf_path', type=str, default='')
    parser.add_argument('--pdf_dir', type=str, default='')
    parser.add_argument('--save_dir', type=str, default='./images')
    args = parser.parse_args()

    # Without either input there is nothing to convert; fail with a usage
    # message instead of crashing while listing an empty path.
    if args.pdf_path == '' and args.pdf_dir == '':
        parser.error('one of --pdf_path or --pdf_dir is required')

    count = 0
    os.makedirs(args.save_dir, exist_ok=True)

    if args.pdf_path == '':
        # Directory mode: convert every PDF found under --pdf_dir.
        for item in tqdm(findfiles(args.pdf_dir)):
            pyMuPDF_fitz(item, args.save_dir)
    else:
        pyMuPDF_fitz(args.pdf_path, args.save_dir)


if __name__ == "__main__":
    main()