pdf_to_image.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. import fitz # fitz就是pip install PyMuPDF
  2. import uuid
  3. import os
  4. import argparse
  5. from tqdm import tqdm
  6. def findfiles(path):
  7. result = []
  8. # 首先遍历当前目录所有文件及文件夹
  9. file_list = os.listdir(path)
  10. # 循环判断每个元素是否是文件夹还是文件,是文件夹的话,递归
  11. for file in file_list:
  12. # 利用os.path.join()方法取得路径全名,并存入cur_path变量,否则每次只能遍历一层目录
  13. cur_path = os.path.join(path, file)
  14. # 判断是否是文件夹
  15. if os.path.isdir(cur_path):
  16. findfiles(cur_path)
  17. else:
  18. # 判断是否是特定文件名称
  19. if 'pdf' in file:
  20. # print(cur_path)
  21. result.append(cur_path)
  22. return result
  23. def pyMuPDF_fitz(pdfPath, imagePath):
  24. global count
  25. try:
  26. pdfDoc = fitz.open(pdfPath)
  27. cnt = 1
  28. for pg in range(pdfDoc.page_count):
  29. page = pdfDoc[pg]
  30. info = page.bound()
  31. radio = 0.0
  32. if info.width > info.height:
  33. radio = 800.0 / info.width
  34. else:
  35. radio = 800.0 / info.height
  36. rotate = int(0)
  37. zoom_x = 1.33333
  38. zoom_y = 1.333333
  39. mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
  40. try:
  41. pix = page.get_pixmap(matrix=mat, alpha=False)
  42. except Exception as e:
  43. print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
  44. continue
  45. if not os.path.exists(imagePath):
  46. os.makedirs(imagePath)
  47. file_name = os.path.splitext(os.path.basename(pdfPath))[0]
  48. save_path = os.path.join(imagePath, file_name + '-' + str(uuid.uuid1())[0:4] + '-' + str(cnt) + '.jpg')
  49. pix.save(save_path)
  50. count += 1
  51. cnt += 1
  52. except Exception as e:
  53. print(f"发生错误: {e}")
  54. pdfDoc.close()
  55. if __name__ == "__main__":
  56. parser = argparse.ArgumentParser()
  57. parser.add_argument('--pdf_path', type=str, default='')
  58. parser.add_argument('--pdf_dir', type=str, default='')
  59. parser.add_argument('--save_dir', type=str, default='./images')
  60. args = parser.parse_args()
  61. count = 0
  62. if not os.path.exists(args.save_dir):
  63. os.makedirs(args.save_dir)
  64. if args.pdf_path == '':
  65. result = findfiles(args.pdf_dir)
  66. file_count = len(result)
  67. for item in tqdm(result):
  68. pyMuPDF_fitz(item, args.save_dir)
  69. else:
  70. pyMuPDF_fitz(args.pdf_path, args.save_dir)