# Others / DocumentAIKit
							import argparse
import os
import shutil
import fitz
import uuid
import random
import json


# Module-level accumulator: pyMuPDF_fitz appends one
# {"file_name": ..., "page_numbers": ...} entry per processed PDF, and main
# dumps the whole list to files_info.json.
json_data = []


def is_in_whitelist(file_name, whitelist):
    """Return True when the file's extension is listed in *whitelist*.

    The extension is taken from ``os.path.splitext`` (so it includes the
    leading dot) and is lower-cased before the membership test; entries in
    *whitelist* are compared as given.
    """
    suffix = os.path.splitext(file_name)[1].lower()
    return suffix in whitelist


def generate_unique_random_numbers(start, end, count):
    """Pick *count* distinct random integers from the half-open range [start, end).

    Args:
        start: Smallest value that may be chosen (inclusive).
        end: Upper bound (exclusive).
        count: How many distinct values to return. A value of zero or less
            yields an empty list.

    Returns:
        A list of *count* distinct integers in ascending order.

    Raises:
        ValueError: If *count* exceeds the number of integers available in
            [start, end). A rejection-sampling loop would never terminate
            for such a request, so it is reported explicitly instead.
    """
    if count <= 0:
        return []

    population = range(start, end)
    if count > len(population):
        raise ValueError(
            f"cannot pick {count} unique numbers from a range of "
            f"{len(population)} values"
        )

    # random.sample draws without replacement, so no retry loop is needed.
    return sorted(random.sample(population, count))


def get_render_pages(count, need):
    """Choose which zero-based page indices of a *count*-page document to render.

    Documents with fewer than three pages are returned in full, regardless
    of *need*. Otherwise *need* distinct indices are drawn at random from
    ``0 .. count - 1`` via ``generate_unique_random_numbers``.
    """
    is_short_document = count < 3
    if not is_short_document:
        return generate_unique_random_numbers(0, count, need)
    return [*range(count)]


def pyMuPDF_fitz(pdfPath, imagePath, need):
    """Render selected pages of one PDF to ``.jpg`` files and record the selection.

    Args:
        pdfPath: Path of the PDF to open with ``fitz.open``.
        imagePath: Directory that receives the rendered images; it is created
            on demand right before the first image is saved.
        need: Number of pages to render. ``0``, or any value that is at least
            the document's page count, renders every page; otherwise the
            pages are chosen by ``get_render_pages``.

    Side effects:
        Appends ``{"file_name": ..., "page_numbers": ...}`` to the module-level
        ``json_data`` list before rendering starts, and writes one image per
        successfully rendered page, named
        ``<pdf stem>-<4 chars of a uuid1>-<page index>.jpg``.

    Errors are reported with ``print`` and do not propagate: a page whose
    pixmap cannot be produced is skipped, and any other failure abandons the
    rest of this document.
    """
    # Start as None so the cleanup below is safe even when fitz.open() fails.
    pdfDoc = None
    try:
        pdfDoc = fitz.open(pdfPath)

        file_name = os.path.splitext(os.path.basename(pdfPath))[0]
        page_count = pdfDoc.page_count

        if need == 0 or need >= page_count:
            page_array = list(range(page_count))
        else:
            page_array = get_render_pages(page_count, need)

        file_info = {
            "file_name": file_name,
            "page_numbers": page_array,
        }
        json_data.append(file_info)

        # The render matrix is identical for every page, so build it once.
        # NOTE(review): the ~1.333 zoom presumably maps 72 dpi to 96 dpi — confirm.
        rotate = 0
        zoom_x = 1.33333
        zoom_y = 1.333333
        mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

        for pg in page_array:
            page = pdfDoc[pg]
            try:
                pix = page.get_pixmap(matrix=mat, alpha=False)
            except Exception as e:
                # Best effort: report the page that failed and move on.
                print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
                continue

            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(imagePath, exist_ok=True)

            save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
            pix.save(save_path)

    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        # Close only a document that was actually opened; closing unconditionally
        # after a failed open would raise on the unbound name.
        if pdfDoc is not None:
            pdfDoc.close()


def process_files(input, output, need):
    """Walk *input* recursively and hand every visible PDF to ``pyMuPDF_fitz``.

    Files whose name starts with a dot are skipped, as are files whose
    extension is not ``.pdf`` (case-insensitive, via ``is_in_whitelist``).
    *output* and *need* are passed through unchanged for each matching file.
    """
    allowed_suffixes = [".pdf"]
    # Only file names matter here; sub-directories are reached by os.walk itself.
    for folder, _subdirs, names in os.walk(input):
        for name in names:
            if name.startswith("."):
                continue
            if not is_in_whitelist(name, allowed_suffixes):
                continue
            pyMuPDF_fitz(os.path.join(folder, name), output, need)


def main():
    """Command-line entry point: render PDF pages and write ``files_info.json``.

    Positional arguments are the input directory and the output directory;
    ``--count`` is the number of pages to extract per PDF (default ``0``).
    Images go to ``<output>/images`` and the accumulated ``json_data`` list is
    written to ``<output>/files_info.json``.
    """
    # Pass the text as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would replace the program name in usage.
    parse = argparse.ArgumentParser(description="从PDF当中随机抽取图片\n")
    parse.add_argument("input", help="输入路径")
    parse.add_argument("output", help="输出路径")
    parse.add_argument("--count", type=int, default=0, help="每份PDF需要提取的图片数量")

    args = parse.parse_args()
    process_files(args.input, os.path.join(args.output, "images"), args.count)

    # The output directory only exists if at least one image was saved, so
    # create it explicitly before writing the summary file.
    os.makedirs(args.output, exist_ok=True)
    json_path = os.path.join(args.output, "files_info.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()