|
@@ -0,0 +1,101 @@
|
|
|
+import argparse
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import fitz
|
|
|
+import uuid
|
|
|
+import random
|
|
|
+import json
|
|
|
+
|
|
|
+
|
|
|
# Module-level accumulator: pyMuPDF_fitz() appends one
# {"file_name": ..., "page_numbers": [...]} record per processed PDF, and
# main() dumps the whole list to files_info.json at the end of the run.
+json_data = []
|
|
|
+
|
|
|
+
|
|
|
def is_in_whitelist(file_name, whitelist):
    """Return True if the extension of *file_name* is listed in *whitelist*.

    The comparison is case-insensitive on both sides, so ``"A.PDF"`` matches
    a whitelist of ``[".pdf"]`` and ``"a.pdf"`` matches ``[".PDF"]``.

    Args:
        file_name: File name or path; only its final extension is examined.
        whitelist: Iterable of extensions including the leading dot,
            e.g. ``[".pdf"]``.

    Returns:
        bool: whether the file's extension is allowed.
    """
    # Extract the file's extension (includes the leading dot, may be empty).
    _, ext = os.path.splitext(file_name)
    ext = ext.lower()
    # Normalise the whitelist entries too; otherwise an upper-case entry
    # could never match the lower-cased extension.
    return any(ext == allowed.lower() for allowed in whitelist)
|
|
|
+
|
|
|
+
|
|
|
def generate_unique_random_numbers(start, end, count):
    """Pick up to *count* distinct random integers from ``[start, end)``.

    If the range holds fewer than *count* integers, every integer in the
    range is returned instead of looping forever looking for values that do
    not exist. An empty range or a non-positive *count* yields ``[]``.

    Args:
        start: Inclusive lower bound.
        end: Exclusive upper bound.
        count: Number of distinct values requested.

    Returns:
        list[int]: the selected values in ascending order.
    """
    population = range(start, end)  # empty when end <= start
    # Clamp the request to what the range can actually supply.
    wanted = max(0, min(count, len(population)))
    # random.sample draws without replacement, so uniqueness is guaranteed
    # without a rejection loop; sort for a deterministic ordering.
    return sorted(random.sample(population, wanted))
|
|
|
+
|
|
|
+
|
|
|
def get_render_pages(count, need):
    """Choose which page indices of a *count*-page document to render.

    When the document has no more pages than requested (or no limit was
    given), every page is selected. Otherwise *need* distinct page indices
    are drawn at random. The result never contains more than *need* pages.

    Args:
        count: Total number of pages in the document.
        need: Number of pages wanted, or ``None`` for "all pages".

    Returns:
        list[int]: zero-based page indices.
    """
    # Comparing against `need` (rather than a fixed threshold) guarantees the
    # random picker is only ever asked for fewer values than the range holds.
    if need is None or need >= count:
        return list(range(count))
    return generate_unique_random_numbers(0, count, need)
|
|
|
+
|
|
|
+
|
|
|
def pyMuPDF_fitz(pdfPath, imagePath, need):
    """Render a selection of pages from one PDF to image files.

    Opens *pdfPath* with PyMuPDF, selects pages via ``get_render_pages``,
    appends a ``{"file_name", "page_numbers"}`` record to the module-level
    ``json_data`` list, and saves each selected page under *imagePath* as
    ``<pdf stem>-<4 hex chars>-<page index>.jpg``.

    Errors are reported with ``print`` and never propagate: a page that
    fails to rasterise is skipped, and any other failure abandons the
    current PDF only.

    Args:
        pdfPath: Path of the PDF file to read.
        imagePath: Directory for the rendered images (created on demand).
        need: Number of pages to render, forwarded to ``get_render_pages``.
    """
    pdfDoc = None
    try:
        pdfDoc = fitz.open(pdfPath)

        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
        page_array = get_render_pages(pdfDoc.page_count, need)

        # Record the selection before rendering so the JSON report lists the
        # pages that were chosen for this file.
        json_data.append({
            "file_name": file_name,
            "page_numbers": page_array,
        })

        # One uniform scale factor for both axes, built once for all pages.
        # NOTE(review): 4/3 presumably maps 72-dpi PDF points to 96-dpi
        # pixels -- confirm the intended output resolution.
        zoom = 4.0 / 3.0
        mat = fitz.Matrix(zoom, zoom)

        for pg in page_array:
            page = pdfDoc[pg]
            try:
                pix = page.get_pixmap(matrix=mat, alpha=False)
            except Exception as e:
                # Best effort: report the page that failed and carry on.
                print("pdfPath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
                continue

            # exist_ok avoids the check-then-create race of a manual test.
            os.makedirs(imagePath, exist_ok=True)

            # A short uuid fragment keeps same-named PDFs from different
            # folders from overwriting each other's images.
            save_path = os.path.join(
                imagePath,
                file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg',
            )
            pix.save(save_path)

    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        # pdfDoc is still None when fitz.open itself failed; closing it
        # unconditionally would raise and abort the whole batch.
        if pdfDoc is not None:
            pdfDoc.close()
|
|
|
+
|
|
|
+
|
|
|
def process_files(input, output, need):
    """Render pages from every PDF found beneath the *input* directory.

    Walks *input* recursively and hands each non-hidden file whose extension
    is whitelisted as PDF to ``pyMuPDF_fitz``.

    Args:
        input: Root directory to scan.
        output: Directory that receives the rendered images.
        need: Number of pages to render per PDF.
    """
    pdf_suffixes = [".pdf"]
    # Traverse the tree; only file entries matter, sub-directory names are
    # visited by os.walk itself.
    for folder, _subdirs, names in os.walk(input):
        for name in names:
            # Skip hidden files (names starting with a dot).
            if name.startswith("."):
                continue
            if not is_in_whitelist(name, pdf_suffixes):
                continue
            pyMuPDF_fitz(os.path.join(folder, name), output, need)
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Command-line entry point: sample pages from PDFs and write a report.

    Positional arguments are the input directory to scan and the output
    directory. Rendered images go to ``<output>/images`` and the list of
    selected pages per PDF is written to ``<output>/files_info.json``.
    ``--count`` sets how many pages to extract per PDF (default 3).
    """
    # The text must be passed as `description`; the first positional
    # parameter of ArgumentParser is `prog` (the program name in usage).
    parse = argparse.ArgumentParser(description="从PDF当中随机抽取图片\n")
    parse.add_argument("input", help="输入路径")
    parse.add_argument("output", help="输出路径")
    # A default keeps `need` from being None when the flag is omitted.
    parse.add_argument("--count", type=int, default=3, help="每份PDF需要提取的图片数量")

    args = parse.parse_args()

    # Make sure the output root exists even when no image gets rendered,
    # otherwise writing the JSON report below would fail.
    os.makedirs(args.output, exist_ok=True)

    process_files(args.input, os.path.join(args.output, "images"), args.count)

    json_path = os.path.join(args.output, "files_info.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
|