# images_fetching.py (3.1 KB)
  1. import argparse
  2. import os
  3. import shutil
  4. import fitz
  5. import uuid
  6. import random
  7. import json
# Module-level accumulator: pyMuPDF_fitz appends one
# {"file_name", "page_numbers"} record for each PDF it opens, and main()
# dumps the whole list to files_info.json.
json_data = []
  9. def is_in_whitelist(file_name, whitelist):
  10. # 获取文件的后缀名
  11. _, ext = os.path.splitext(file_name)
  12. # 判断后缀名是否在白名单中
  13. return ext.lower() in whitelist
  14. def generate_unique_random_numbers(start, end, count):
  15. numbers = set()
  16. while len(numbers) < count:
  17. numbers.add(random.randint(start, end-1))
  18. return list(numbers)[:count]
  19. def get_render_pages(count, need):
  20. if count < 3:
  21. return list(range(count))
  22. else:
  23. return generate_unique_random_numbers(0, count, need)
  24. def pyMuPDF_fitz(pdfPath, imagePath, need):
  25. try:
  26. pdfDoc = fitz.open(pdfPath)
  27. file_name = os.path.splitext(os.path.basename(pdfPath))[0]
  28. page_array = []
  29. if need == 0:
  30. page_array = list(range(pdfDoc.page_count))
  31. elif need >= pdfDoc.page_count:
  32. page_array = list(range(pdfDoc.page_count))
  33. else:
  34. page_array = get_render_pages(pdfDoc.page_count, need)
  35. file_info = {
  36. "file_name" : file_name,
  37. "page_numbers" : page_array
  38. }
  39. json_data.append(file_info)
  40. for pg in page_array:
  41. page = pdfDoc[pg]
  42. info = page.bound()
  43. radio = 0.0
  44. if info.width > info.height:
  45. radio = 800.0 / info.width
  46. else:
  47. radio = 800.0 / info.height
  48. rotate = int(0)
  49. zoom_x = 1.33333
  50. zoom_y = 1.333333
  51. mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
  52. try:
  53. pix = page.get_pixmap(matrix=mat, alpha=False)
  54. except Exception as e:
  55. print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
  56. continue
  57. if not os.path.exists(imagePath):
  58. os.makedirs(imagePath)
  59. save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(pg) + '.jpg')
  60. pix.save(save_path)
  61. except Exception as e:
  62. print(f"发生错误: {e}")
  63. pdfDoc.close()
  64. def process_files(input, output, need):
  65. # 遍历目录中的文件和文件夹
  66. for root, dirs, files in os.walk(input):
  67. # 只输出文件名,不包括文件夹
  68. for file in files:
  69. if not file.startswith(".") and is_in_whitelist(file, [".pdf"]):
  70. file_path = os.path.join(root, file)
  71. pyMuPDF_fitz(file_path, output, need)
  72. def main():
  73. parse = argparse.ArgumentParser("从PDF当中随机抽取图片\n")
  74. parse.add_argument("input", help="输入路径")
  75. parse.add_argument("output", help="输出路径")
  76. parse.add_argument("--count", type=int, default=0, help="每份PDF需要提取的图片数量")
  77. args = parse.parse_args()
  78. process_files(args.input, os.path.join(args.output, "images"), args.count)
  79. json_path = os.path.join(args.output, "files_info.json")
  80. with open(json_path, 'w', encoding='utf-8') as f:
  81. json.dump(json_data, f, ensure_ascii=False, indent=4)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()