12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
import argparse
import os
import uuid

import fitz  # provided by the PyMuPDF package (pip install PyMuPDF)
from tqdm import tqdm
def findfiles(path):
    """Recursively collect the paths of all PDF files under ``path``.

    Args:
        path: Directory to search.

    Returns:
        A list with one entry per file whose name ends in ``.pdf``
        (case-insensitive), including files found in nested
        sub-directories. Each entry is ``path`` joined with the file's
        relative location.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``path`` cannot be
            listed (for example, when it does not exist).
    """
    result = []
    for name in os.listdir(path):
        # Join with the parent so the entry can be tested and descended into;
        # a bare name would only resolve relative to the working directory.
        cur_path = os.path.join(path, name)
        if os.path.isdir(cur_path):
            # Merge the matches of the sub-directory into this level's list;
            # calling the function without using its return value loses them.
            result.extend(findfiles(cur_path))
        elif name.lower().endswith('.pdf'):
            # Match on the extension rather than on any "pdf" substring so
            # that e.g. "pdf_notes.txt" is skipped and "SCAN.PDF" is kept.
            result.append(cur_path)
    return result
def pyMuPDF_fitz(pdfPath, imagePath):
    """Render every page of a PDF to a JPEG file inside ``imagePath``.

    Each image is saved as ``<pdf stem>-<4 chars>-<n>.jpg`` where the
    4-character fragment is the start of ``uuid.uuid1()`` and ``n`` counts the
    images saved for this document, starting at 1. A page that fails to
    render is reported on stdout and skipped (it does not consume a number).
    The module-level ``count`` is incremented once per saved image.

    Args:
        pdfPath: Path of the PDF document to convert.
        imagePath: Directory that receives the images; created if missing.
    """
    global count

    # Treat both "\\" and "/" as separators so the stem is extracted correctly
    # whichever platform the path was written on; splitext then drops the
    # extension without assuming it is exactly four characters long.
    file_name = str(pdfPath).replace('\\', '/').split('/')[-1]
    stem = os.path.splitext(file_name)[0]

    # The transform is the same for every page, so build it once.
    # NOTE(review): the two factors differ in the last digit; both look like
    # approximations of 4/3 -- confirm whether the asymmetry is intended.
    zoom_x = 1.33333
    zoom_y = 1.333333
    rotate = 0
    mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    pdfDoc = fitz.open(pdfPath)
    try:
        os.makedirs(imagePath, exist_ok=True)
        cnt = 1
        for pg in range(pdfDoc.page_count):
            page = pdfDoc[pg]
            try:
                pix = page.get_pixmap(matrix=mat, alpha=False)
            except Exception as e:
                # Best effort: one unrenderable page must not abort the rest.
                print("pdfPath=" + str(pdfPath) + " ---------------- ", e.__class__.__name__, e)
                continue
            image_name = '{}-{}-{}.jpg'.format(stem, str(uuid.uuid1())[0:4], cnt)
            pix.save(os.path.join(imagePath, image_name))
            # ``count`` is only initialised by the script entry point; fall
            # back to 0 so the function also works when the module is imported.
            count = globals().get('count', 0) + 1
            cnt += 1
    finally:
        # Release the document handle even if saving an image raises.
        pdfDoc.close()
if __name__ == "__main__":
    # Command-line entry point: convert a single PDF (--pdf_path) or every PDF
    # found under a directory tree (--pdf_dir) into images in --save_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('--pdf_path', type=str, default='',
                        help='path of a single PDF to convert')
    parser.add_argument('--pdf_dir', type=str, default='',
                        help='directory searched for PDFs when --pdf_path is not given')
    parser.add_argument('--save_dir', type=str, default='./images',
                        help='directory that receives the generated images')
    args = parser.parse_args()

    # Without either input the directory branch would call os.listdir('') and
    # die with a traceback; report a usage error instead.
    if args.pdf_path == '' and args.pdf_dir == '':
        parser.error('one of --pdf_path or --pdf_dir is required')

    # Module-level counter updated by pyMuPDF_fitz for every saved image.
    count = 0
    os.makedirs(args.save_dir, exist_ok=True)

    if args.pdf_path == '':
        # --pdf_path takes precedence; the directory is only scanned without it.
        for item in tqdm(findfiles(args.pdf_dir)):
            pyMuPDF_fitz(item, args.save_dir)
    else:
        pyMuPDF_fitz(args.pdf_path, args.save_dir)
|