# images_fetching.py (2.9 KB)
  1. import argparse
  2. import os
  3. import shutil
  4. import fitz
  5. import uuid
  6. import random
  7. import json
  8. json_data = []
  9. def is_in_whitelist(file_name, whitelist):
  10. # 获取文件的后缀名
  11. _, ext = os.path.splitext(file_name)
  12. # 判断后缀名是否在白名单中
  13. return ext.lower() in whitelist
  14. def generate_unique_random_numbers(start, end, count):
  15. numbers = set()
  16. while len(numbers) < count:
  17. numbers.add(random.randint(start, end-1))
  18. return list(numbers)[:count]
  19. def get_render_pages(count, need):
  20. if count < 3:
  21. return list(range(count))
  22. else:
  23. return generate_unique_random_numbers(0, count, need)
  24. def pyMuPDF_fitz(pdfPath, imagePath, need):
  25. try:
  26. pdfDoc = fitz.open(pdfPath)
  27. file_name = os.path.splitext(os.path.basename(pdfPath))[0]
  28. page_array = get_render_pages(pdfDoc.page_count, need)
  29. file_info = {
  30. "file_name" : file_name,
  31. "page_numbers" : page_array
  32. }
  33. json_data.append(file_info)
  34. for pg in page_array:
  35. page = pdfDoc[pg]
  36. info = page.bound()
  37. radio = 0.0
  38. if info.width > info.height:
  39. radio = 800.0 / info.width
  40. else:
  41. radio = 800.0 / info.height
  42. rotate = int(0)
  43. zoom_x = 1.33333
  44. zoom_y = 1.333333
  45. mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
  46. try:
  47. pix = page.get_pixmap(matrix=mat, alpha=False)
  48. except Exception as e:
  49. print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
  50. continue
  51. if not os.path.exists(imagePath):
  52. os.makedirs(imagePath)
  53. save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
  54. pix.save(save_path)
  55. except Exception as e:
  56. print(f"发生错误: {e}")
  57. pdfDoc.close()
  58. def process_files(input, output, need):
  59. # 遍历目录中的文件和文件夹
  60. for root, dirs, files in os.walk(input):
  61. # 只输出文件名,不包括文件夹
  62. for file in files:
  63. if not file.startswith(".") and is_in_whitelist(file, [".pdf"]):
  64. file_path = os.path.join(root, file)
  65. pyMuPDF_fitz(file_path, output, need)
  66. def main():
  67. parse = argparse.ArgumentParser("从PDF当中随机抽取图片\n")
  68. parse.add_argument("input", help="输入路径")
  69. parse.add_argument("output", help="输出路径")
  70. parse.add_argument("--count", type=int, help="每份PDF需要提取的图片数量")
  71. args = parse.parse_args()
  72. process_files(args.input, os.path.join(args.output, "images"), args.count)
  73. json_path = os.path.join(args.output, "files_info.json")
  74. with open(json_path, 'w', encoding='utf-8') as f:
  75. json.dump(json_data, f, ensure_ascii=False, indent=4)
  76. if __name__ == "__main__":
  77. main()