123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- from copy import deepcopy
- from docx import Document
- from docx import shared
- from docx.enum.text import WD_ALIGN_PARAGRAPH
- from docx.enum.section import WD_SECTION
- from docx.oxml.ns import qn
- from docx.enum.table import WD_TABLE_ALIGNMENT
- from ppstructure.recovery.table_process import HtmlToDocx
- from ppocr.utils.logging import get_logger
- logger = get_logger()
def _start_section_with_columns(doc, num_columns):
    """Append a continuous section to *doc* laid out in *num_columns* columns."""
    section = doc.add_section(WD_SECTION.CONTINUOUS)
    # python-docx exposes no public column API here, so the <w:cols> element
    # of the section properties is edited directly.
    section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), str(num_columns))


def convert_info_docx(img, res, save_folder, img_name):
    """
    Write layout-analysis regions into a Word document.

    The document is saved as ``<save_folder>/<img_name>_ocr.docx``.

    args:
        img: not used by this function; kept so existing callers keep working
        res(list): region dicts. Each region is read for 'layout'
            ('single' / 'double'), 'type' and 'res'; figure regions are
            additionally read for 'bbox' and 'img_idx'
        save_folder(str): output directory (must already exist)
        img_name(str): base name for the output file and for the figure
            sub-folder
    return:
        None
    """
    doc = Document()
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    # East-Asian glyphs need their own font mapping on the style element.
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = shared.Pt(6.5)

    # Number of text columns in the section currently being written (1 or 2).
    columns = 1
    for region in res:
        layout = region['layout']
        # Open a new continuous section only when the layout actually changes.
        if columns == 2 and layout == 'single':
            _start_section_with_columns(doc, 1)
            columns = 1
        elif columns == 1 and layout == 'double':
            _start_section_with_columns(doc, 2)
            columns = 2

        region_type = region['type'].lower()
        if region_type == 'figure':
            # The cropped figure is expected at
            # <save_folder>/<img_name>/<bbox>_<img_idx>.jpg
            # NOTE(review): presumably written by an earlier pipeline step;
            # confirm against the caller.
            figure_folder = os.path.join(save_folder, img_name)
            img_path = os.path.join(
                figure_folder,
                '{}_{}.jpg'.format(region['bbox'], region['img_idx']))
            paragraph_pic = doc.add_paragraph()
            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = paragraph_pic.add_run("")
            # Narrower pictures when two columns share the page width.
            picture_width = shared.Inches(5) if columns == 1 else shared.Inches(2)
            run.add_picture(img_path, width=picture_width)
        elif region_type == 'title':
            # A title region without any recognised text has nothing to
            # write; indexing [0] on it would raise IndexError.
            if region['res']:
                doc.add_heading(region['res'][0]['text'])
        elif region_type == 'table':
            parser = HtmlToDocx()
            parser.table_style = 'TableGrid'
            parser.handle_table(region['res']['html'], doc)
        else:
            # Any other region type is written as one paragraph of text lines.
            paragraph = doc.add_paragraph()
            paragraph_format = paragraph.paragraph_format
            for line_idx, line in enumerate(region['res']):
                if line_idx == 0:
                    paragraph_format.first_line_indent = shared.Inches(0.25)
                text_run = paragraph.add_run(line['text'] + ' ')
                text_run.font.size = shared.Pt(10)

    # save to docx
    docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))
def sorted_layout_boxes(res, w):
    """
    Sort layout boxes in reading order and tag each with its layout.

    Boxes are first ordered by (bbox[1], bbox[0]). Each box is then classed
    as belonging to a left column, a right column, or as a full-width
    ('single') box by comparing bbox[0] and bbox[2] with fractions of ``w``.
    Column boxes are buffered and emitted left column first, then right
    column, whenever a 'single' box or the end of the list is reached.
    Every box dict gets a 'layout' key set to 'single' or 'double' (the
    input dicts are modified in place).

    args:
        res(list): ppstructure results; each item is a dict with a 'bbox'
            sequence (presumably [x_min, y_min, x_max, y_max] - confirm
            against the producer)
        w(number): width the bbox x-coordinates are compared against
            (presumably the page/image width)
    return:
        sorted results(list). For a one-element input the same list object
        is returned; otherwise a new list is built.
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]['layout'] = 'single'
        return res

    boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
    half_w = w / 2
    quarter_w = w / 4
    three_quarter_w = 3 * w / 4

    new_res = []
    res_left = []
    res_right = []

    def flush_columns():
        # Emit the buffered columns (left first, then right) and reset them.
        new_res.extend(res_left)
        new_res.extend(res_right)
        res_left.clear()
        res_right.clear()

    for i, box in enumerate(boxes):
        bbox = box['bbox']
        x_min, y_min, x_max = bbox[0], bbox[1], bbox[2]

        if i == num_boxes - 1:
            # The last box uses looser rules: it is 'single' only if it
            # starts below the previous box and straddles the page centre.
            prev_bottom = boxes[i - 1]['bbox'][3]
            if y_min > prev_bottom and x_min < half_w and x_max > half_w:
                column = None
            elif x_max > half_w:
                column = res_right
            else:
                # Unconditional fallback: previously a box matching neither
                # column test was dropped together with all buffered boxes.
                column = res_left
        elif x_min < quarter_w and x_max < three_quarter_w:
            column = res_left
        elif x_min > quarter_w and x_max > half_w:
            column = res_right
        else:
            column = None

        if column is None:
            # A full-width box ends the current two-column run.
            flush_columns()
            box['layout'] = 'single'
            new_res.append(box)
        else:
            box['layout'] = 'double'
            column.append(box)

    flush_columns()
    return new_res
|