pdf_to_image.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. import fitz # fitz就是pip install PyMuPDF
  2. import uuid
  3. import os
  4. import argparse
  5. from tqdm import tqdm
  6. def findfiles(path):
  7. result = []
  8. # 首先遍历当前目录所有文件及文件夹
  9. file_list = os.listdir(path)
  10. # 循环判断每个元素是否是文件夹还是文件,是文件夹的话,递归
  11. for file in file_list:
  12. # 利用os.path.join()方法取得路径全名,并存入cur_path变量,否则每次只能遍历一层目录
  13. cur_path = os.path.join(path, file)
  14. # 判断是否是文件夹
  15. if os.path.isdir(cur_path):
  16. findfiles(cur_path)
  17. else:
  18. # 判断是否是特定文件名称
  19. if 'pdf' in file:
  20. # print(cur_path)
  21. result.append(cur_path)
  22. return result
  23. def pyMuPDF_fitz(pdfPath, imagePath):
  24. global count
  25. pdfDoc = fitz.open(pdfPath)
  26. cnt = 1
  27. for pg in range(pdfDoc.page_count):
  28. page = pdfDoc[pg]
  29. info = page.bound()
  30. radio = 0.0
  31. if info.width > info.height:
  32. radio = 800.0 / info.width
  33. else:
  34. radio = 800.0 / info.height
  35. rotate = int(0)
  36. zoom_x = 1.33333
  37. zoom_y = 1.333333
  38. mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
  39. try:
  40. pix = page.get_pixmap(matrix=mat, alpha=False)
  41. except Exception as e:
  42. print("imagePath=" + pdfPath + " ---------------- ", e.__class__.__name__, e)
  43. continue
  44. if not os.path.exists(imagePath):
  45. os.makedirs(imagePath)
  46. pix.save(imagePath + '/' + str(pdfPath).split('\\')[-1][0:-4] + '-' + str(uuid.uuid1())[0:4] + '-' + str(cnt) + '.jpg')
  47. count += 1
  48. cnt += 1
  49. if __name__ == "__main__":
  50. parser = argparse.ArgumentParser()
  51. parser.add_argument('--pdf_path', type=str, default='')
  52. parser.add_argument('--pdf_dir', type=str, default='')
  53. parser.add_argument('--save_dir', type=str, default='./images')
  54. args = parser.parse_args()
  55. count = 0
  56. if not os.path.exists(args.save_dir):
  57. os.makedirs(args.save_dir)
  58. if args.pdf_path == '':
  59. result = findfiles(args.pdf_dir)
  60. file_count = len(result)
  61. for item in tqdm(result):
  62. pyMuPDF_fitz(item, args.save_dir)
  63. else:
  64. pyMuPDF_fitz(args.pdf_path, args.save_dir)