123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- import argparse
- import os
- import shutil
- import fitz
- import uuid
- import random
- import json
# Module-level accumulator. pyMuPDF_fitz appends one
# {"file_name": ..., "page_numbers": [...]} record per PDF it handles, and
# main() serialises the whole list to files_info.json.
json_data = []
def is_in_whitelist(file_name, whitelist):
    """Return True if the extension of *file_name* is listed in *whitelist*.

    Args:
        file_name: File name or path; only its final extension is examined.
        whitelist: Iterable of extensions including the leading dot,
            e.g. ``[".pdf"]``.

    Returns:
        bool: True when the extension equals one of the entries, ignoring
        letter case on both sides.
    """
    # Extract the extension (keeps the leading dot; "" when there is none).
    _, ext = os.path.splitext(file_name)
    # Lower-case the entries as well as the extension: previously only the
    # extension was normalised, so an entry such as ".PDF" could never match.
    return ext.lower() in {entry.lower() for entry in whitelist}
def generate_unique_random_numbers(start, end, count):
    """Pick up to *count* distinct random integers from ``[start, end)``.

    Args:
        start: Inclusive lower bound of the range.
        end: Exclusive upper bound of the range.
        count: Number of distinct values requested.

    Returns:
        list[int]: Distinct values in ascending order. Holds ``count`` items,
        or every value of the range when fewer than ``count`` exist; empty
        when ``count`` is not positive or the range is empty.
    """
    population = range(start, end)
    # Clamp the request to what the range can supply. The earlier
    # "add random values to a set until it is big enough" loop never
    # terminated when count exceeded the number of distinct values.
    wanted = max(0, min(count, len(population)))
    # random.sample draws without replacement, so no retry loop is needed;
    # sorting gives callers a stable, ascending order.
    return sorted(random.sample(population, wanted))
def get_render_pages(count, need):
    """Choose which zero-based page indices of a document to render.

    Args:
        count: Total number of pages in the document.
        need: Number of pages wanted.

    Returns:
        list[int]: Every index ``0 .. count - 1`` when the document has fewer
        than three pages or when ``need`` is at least ``count``; otherwise
        ``need`` distinct indices chosen at random.
    """
    # Documents with fewer than three pages are always rendered in full.
    # A request for at least every page is answered directly too, because a
    # random picker cannot supply more distinct indices than exist.
    if count < 3 or need >= count:
        return list(range(count))
    return generate_unique_random_numbers(0, count, need)
def pyMuPDF_fitz(pdfPath, imagePath, need):
    """Render selected pages of one PDF to JPEG files and record the choice.

    Appends ``{"file_name": ..., "page_numbers": [...]}`` to the module-level
    ``json_data`` list, then saves every selected page as
    ``<imagePath>/<pdf stem>-<4 hex chars>-<page index>.jpg``.

    Args:
        pdfPath: Path of the PDF file to open.
        imagePath: Directory that receives the images; created on demand
            the first time a page is saved.
        need: Number of pages to render. ``0``, or any value that is not
            below the document's page count, renders every page; otherwise
            the pages are chosen by ``get_render_pages``.

    Returns:
        None. Errors are printed instead of raised so that one unreadable
        file does not abort a batch run.
    """
    try:
        # The context manager closes the document on every path. The former
        # trailing pdfDoc.close() failed with an unbound-name error whenever
        # fitz.open itself raised.
        with fitz.open(pdfPath) as pdfDoc:
            file_name = os.path.splitext(os.path.basename(pdfPath))[0]
            page_count = pdfDoc.page_count
            if need == 0 or need >= page_count:
                page_array = list(range(page_count))
            else:
                page_array = get_render_pages(page_count, need)

            # Recorded before rendering, so the entry lists the pages that
            # were selected even if some of them later fail to render.
            json_data.append({
                "file_name": file_name,
                "page_numbers": page_array,
            })

            # One uniform 4/3 scale factor for both axes, built once for all
            # pages. The original used 1.33333 horizontally and 1.333333
            # vertically, which looks like a typo. No rotation is applied.
            zoom = 4.0 / 3.0
            mat = fitz.Matrix(zoom, zoom)

            for pg in page_array:
                page = pdfDoc[pg]
                try:
                    pix = page.get_pixmap(matrix=mat, alpha=False)
                except Exception as e:
                    # Skip only this page; the label now names the value that
                    # is actually printed (the PDF path).
                    print("pdfPath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
                    continue
                os.makedirs(imagePath, exist_ok=True)
                # A short uuid1 fragment keeps equally named PDFs from
                # different folders from overwriting each other's images.
                tag = str(uuid.uuid1())[0:4]
                save_path = os.path.join(imagePath, f"{file_name}-{tag}-{pg}.jpg")
                pix.save(save_path)
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"发生错误: {e}")
def process_files(input, output, need):
    """Render every non-hidden PDF found beneath the *input* directory.

    Args:
        input: Root directory that is searched recursively.
        output: Directory handed to ``pyMuPDF_fitz`` for the images.
        need: Pages to render per PDF, forwarded unchanged.

    Returns:
        None.
    """
    pdf_extensions = [".pdf"]
    # Walk all files and folders below the root.
    for root, dirs, files in os.walk(input):
        # Sorting dirs in place fixes the descent order and sorting files
        # fixes the per-folder order; os.walk otherwise follows filesystem
        # order, so the order in which PDFs are processed would vary.
        dirs.sort()
        for file in sorted(files):
            # Only file names are considered; dot-files and non-PDFs are
            # skipped.
            if file.startswith(".") or not is_in_whitelist(file, pdf_extensions):
                continue
            pyMuPDF_fitz(os.path.join(root, file), output, need)
def main():
    """Command-line entry point.

    Parses ``input``, ``output`` and ``--count``, renders the PDFs found
    under ``input`` into ``<output>/images`` and writes the collected
    per-file page selections to ``<output>/files_info.json``.
    """
    # The text is the program description; passed positionally it would be
    # taken as ``prog`` and appear as the program name in the usage line.
    parse = argparse.ArgumentParser(description="从PDF当中随机抽取图片\n")
    parse.add_argument("input", help="输入路径")
    parse.add_argument("output", help="输出路径")
    parse.add_argument("--count", type=int, default=0, help="每份PDF需要提取的图片数量")
    args = parse.parse_args()

    process_files(args.input, os.path.join(args.output, "images"), args.count)

    # The images directory is only created once a page is actually saved, so
    # make sure the output directory exists before writing the summary.
    os.makedirs(args.output, exist_ok=True)
    json_path = os.path.join(args.output, "files_info.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
|