recovery_to_doc.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. from copy import deepcopy
  16. from docx import Document
  17. from docx import shared
  18. from docx.enum.text import WD_ALIGN_PARAGRAPH
  19. from docx.enum.section import WD_SECTION
  20. from docx.oxml.ns import qn
  21. from docx.enum.table import WD_TABLE_ALIGNMENT
  22. from ppstructure.recovery.table_process import HtmlToDocx
  23. from ppocr.utils.logging import get_logger
  24. logger = get_logger()
  25. def convert_info_docx(img, res, save_folder, img_name):
  26. doc = Document()
  27. doc.styles['Normal'].font.name = 'Times New Roman'
  28. doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
  29. doc.styles['Normal'].font.size = shared.Pt(6.5)
  30. flag = 1
  31. for i, region in enumerate(res):
  32. img_idx = region['img_idx']
  33. if flag == 2 and region['layout'] == 'single':
  34. section = doc.add_section(WD_SECTION.CONTINUOUS)
  35. section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
  36. flag = 1
  37. elif flag == 1 and region['layout'] == 'double':
  38. section = doc.add_section(WD_SECTION.CONTINUOUS)
  39. section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
  40. flag = 2
  41. if region['type'].lower() == 'figure':
  42. excel_save_folder = os.path.join(save_folder, img_name)
  43. img_path = os.path.join(excel_save_folder,
  44. '{}_{}.jpg'.format(region['bbox'], img_idx))
  45. paragraph_pic = doc.add_paragraph()
  46. paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
  47. run = paragraph_pic.add_run("")
  48. if flag == 1:
  49. run.add_picture(img_path, width=shared.Inches(5))
  50. elif flag == 2:
  51. run.add_picture(img_path, width=shared.Inches(2))
  52. elif region['type'].lower() == 'title':
  53. doc.add_heading(region['res'][0]['text'])
  54. elif region['type'].lower() == 'table':
  55. parser = HtmlToDocx()
  56. parser.table_style = 'TableGrid'
  57. parser.handle_table(region['res']['html'], doc)
  58. else:
  59. paragraph = doc.add_paragraph()
  60. paragraph_format = paragraph.paragraph_format
  61. for i, line in enumerate(region['res']):
  62. if i == 0:
  63. paragraph_format.first_line_indent = shared.Inches(0.25)
  64. text_run = paragraph.add_run(line['text'] + ' ')
  65. text_run.font.size = shared.Pt(10)
  66. # save to docx
  67. docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name))
  68. doc.save(docx_path)
  69. logger.info('docx save to {}'.format(docx_path))
  70. def sorted_layout_boxes(res, w):
  71. """
  72. Sort text boxes in order from top to bottom, left to right
  73. args:
  74. res(list):ppstructure results
  75. return:
  76. sorted results(list)
  77. """
  78. num_boxes = len(res)
  79. if num_boxes == 1:
  80. res[0]['layout'] = 'single'
  81. return res
  82. sorted_boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
  83. _boxes = list(sorted_boxes)
  84. new_res = []
  85. res_left = []
  86. res_right = []
  87. i = 0
  88. while True:
  89. if i >= num_boxes:
  90. break
  91. if i == num_boxes - 1:
  92. if _boxes[i]['bbox'][1] > _boxes[i - 1]['bbox'][3] and _boxes[i][
  93. 'bbox'][0] < w / 2 and _boxes[i]['bbox'][2] > w / 2:
  94. new_res += res_left
  95. new_res += res_right
  96. _boxes[i]['layout'] = 'single'
  97. new_res.append(_boxes[i])
  98. else:
  99. if _boxes[i]['bbox'][2] > w / 2:
  100. _boxes[i]['layout'] = 'double'
  101. res_right.append(_boxes[i])
  102. new_res += res_left
  103. new_res += res_right
  104. elif _boxes[i]['bbox'][0] < w / 2:
  105. _boxes[i]['layout'] = 'double'
  106. res_left.append(_boxes[i])
  107. new_res += res_left
  108. new_res += res_right
  109. res_left = []
  110. res_right = []
  111. break
  112. elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4:
  113. _boxes[i]['layout'] = 'double'
  114. res_left.append(_boxes[i])
  115. i += 1
  116. elif _boxes[i]['bbox'][0] > w / 4 and _boxes[i]['bbox'][2] > w / 2:
  117. _boxes[i]['layout'] = 'double'
  118. res_right.append(_boxes[i])
  119. i += 1
  120. else:
  121. new_res += res_left
  122. new_res += res_right
  123. _boxes[i]['layout'] = 'single'
  124. new_res.append(_boxes[i])
  125. res_left = []
  126. res_right = []
  127. i += 1
  128. if res_left:
  129. new_res += res_left
  130. if res_right:
  131. new_res += res_right
  132. return new_res