# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import os import sys import platform import cv2 import numpy as np import paddle from PIL import Image, ImageDraw, ImageFont import math from paddle import inference import time import random from ppocr.utils.logging import get_logger def str2bool(v): return v.lower() in ("true", "t", "1") def init_args(): parser = argparse.ArgumentParser() # params for prediction engine parser.add_argument("--use_gpu", type=str2bool, default=True) parser.add_argument("--use_xpu", type=str2bool, default=False) parser.add_argument("--use_npu", type=str2bool, default=False) parser.add_argument("--ir_optim", type=str2bool, default=True) parser.add_argument("--use_tensorrt", type=str2bool, default=False) parser.add_argument("--min_subgraph_size", type=int, default=15) parser.add_argument("--precision", type=str, default="fp32") parser.add_argument("--gpu_mem", type=int, default=500) parser.add_argument("--gpu_id", type=int, default=0) # params for text detector parser.add_argument("--image_dir", type=str) parser.add_argument("--page_num", type=int, default=0) parser.add_argument("--det_algorithm", type=str, default='DB') parser.add_argument("--det_model_dir", type=str) parser.add_argument("--det_limit_side_len", type=float, default=960) parser.add_argument("--det_limit_type", type=str, default='max') parser.add_argument("--det_box_type", type=str, default='quad') # DB parmas parser.add_argument("--det_db_thresh", type=float, default=0.3) parser.add_argument("--det_db_box_thresh", type=float, default=0.6) parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) parser.add_argument("--max_batch_size", type=int, default=10) parser.add_argument("--use_dilation", type=str2bool, default=False) parser.add_argument("--det_db_score_mode", type=str, default="fast") # EAST parmas parser.add_argument("--det_east_score_thresh", type=float, default=0.8) parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) # SAST parmas parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) # PSE parmas parser.add_argument("--det_pse_thresh", type=float, default=0) parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) parser.add_argument("--det_pse_min_area", type=float, default=16) parser.add_argument("--det_pse_scale", type=int, default=1) # FCE parmas parser.add_argument("--scales", type=list, default=[8, 16, 32]) parser.add_argument("--alpha", type=float, default=1.0) parser.add_argument("--beta", type=float, default=1.0) parser.add_argument("--fourier_degree", type=int, default=5) # params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') parser.add_argument("--rec_model_dir", type=str) parser.add_argument("--rec_image_inverse", type=str2bool, default=True) parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") parser.add_argument("--rec_batch_num", type=int, default=6) parser.add_argument("--max_text_length", type=int, default=25) parser.add_argument( "--rec_char_dict_path", type=str, default="./ppocr/utils/ppocr_keys_v1.txt") parser.add_argument("--use_space_char", type=str2bool, default=True) parser.add_argument( "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") parser.add_argument("--drop_score", type=float, default=0.5) # params for e2e parser.add_argument("--e2e_algorithm", type=str, default='PGNet') parser.add_argument("--e2e_model_dir", type=str) parser.add_argument("--e2e_limit_side_len", type=float, default=768) parser.add_argument("--e2e_limit_type", type=str, default='max') # PGNet parmas parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) parser.add_argument( "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') # params for text classifier parser.add_argument("--use_angle_cls", type=str2bool, default=False) parser.add_argument("--cls_model_dir", type=str) parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") parser.add_argument("--label_list", type=list, default=['0', '180']) parser.add_argument("--cls_batch_num", type=int, default=6) parser.add_argument("--cls_thresh", type=float, default=0.9) parser.add_argument("--enable_mkldnn", type=str2bool, default=False) parser.add_argument("--cpu_threads", type=int, default=10) parser.add_argument("--use_pdserving", type=str2bool, default=False) parser.add_argument("--warmup", type=str2bool, default=False) # SR parmas parser.add_argument("--sr_model_dir", type=str) parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") parser.add_argument("--sr_batch_num", type=int, default=1) # parser.add_argument( "--draw_img_save_dir", type=str, default="./inference_results") parser.add_argument("--save_crop_res", type=str2bool, default=False) parser.add_argument("--crop_res_save_dir", type=str, default="./output") # multi-process parser.add_argument("--use_mp", type=str2bool, default=False) parser.add_argument("--total_process_num", type=int, default=1) parser.add_argument("--process_id", type=int, default=0) parser.add_argument("--benchmark", type=str2bool, default=False) parser.add_argument("--save_log_path", type=str, default="./log_output/") parser.add_argument("--show_log", type=str2bool, default=True) parser.add_argument("--use_onnx", type=str2bool, default=False) return parser def parse_args(): parser = init_args() return parser.parse_args() def create_predictor(args, mode, logger): if mode == "det": model_dir = args.det_model_dir elif mode == 'cls': model_dir = args.cls_model_dir elif mode == 'rec': model_dir = args.rec_model_dir elif mode == 'table': model_dir = args.table_model_dir elif mode == 'ser': model_dir = args.ser_model_dir elif mode == 're': model_dir = args.re_model_dir elif mode == "sr": model_dir = args.sr_model_dir elif mode == 'layout': model_dir = args.layout_model_dir else: model_dir = args.e2e_model_dir if model_dir is None: logger.info("not find {} model file path {}".format(mode, model_dir)) sys.exit(0) if args.use_onnx: import onnxruntime as ort model_file_path = model_dir if not os.path.exists(model_file_path): raise ValueError("not find model file path {}".format( model_file_path)) sess = ort.InferenceSession(model_file_path) return sess, sess.get_inputs()[0], None, None else: file_names = ['model', 'inference'] for file_name in file_names: model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) if os.path.exists(model_file_path) and os.path.exists( params_file_path): break if not os.path.exists(model_file_path): raise ValueError( "not find model.pdmodel or inference.pdmodel in {}".format( model_dir)) if not os.path.exists(params_file_path): raise ValueError( "not find model.pdiparams or inference.pdiparams in {}".format( model_dir)) config = inference.Config(model_file_path, params_file_path) if hasattr(args, 'precision'): if args.precision == "fp16" and args.use_tensorrt: precision = inference.PrecisionType.Half elif args.precision == "int8": precision = inference.PrecisionType.Int8 else: precision = inference.PrecisionType.Float32 else: precision = inference.PrecisionType.Float32 if args.use_gpu: gpu_id = get_infer_gpuid() if gpu_id is None: logger.warning( "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." ) config.enable_use_gpu(args.gpu_mem, args.gpu_id) if args.use_tensorrt: config.enable_tensorrt_engine( workspace_size=1 << 30, precision_mode=precision, max_batch_size=args.max_batch_size, min_subgraph_size=args. min_subgraph_size, # skip the minmum trt subgraph use_calib_mode=False) # collect shape trt_shape_f = os.path.join(model_dir, f"{mode}_trt_dynamic_shape.txt") if not os.path.exists(trt_shape_f): config.collect_shape_range_info(trt_shape_f) logger.info( f"collect dynamic shape info into : {trt_shape_f}") try: config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, True) except Exception as E: logger.info(E) logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") elif args.use_npu: config.enable_npu() elif args.use_xpu: config.enable_xpu(10 * 1024 * 1024) else: config.disable_gpu() if args.enable_mkldnn: # cache 10 different shapes for mkldnn to avoid memory leak config.set_mkldnn_cache_capacity(10) config.enable_mkldnn() if args.precision == "fp16": config.enable_mkldnn_bfloat16() if hasattr(args, "cpu_threads"): config.set_cpu_math_library_num_threads(args.cpu_threads) else: # default cpu threads as 10 config.set_cpu_math_library_num_threads(10) # enable memory optim config.enable_memory_optim() config.disable_glog_info() config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") config.delete_pass("matmul_transpose_reshape_fuse_pass") if mode == 're': config.delete_pass("simplify_with_basic_ops_pass") if mode == 'table': config.delete_pass("fc_fuse_pass") # not supported for table config.switch_use_feed_fetch_ops(False) config.switch_ir_optim(True) # create predictor predictor = inference.create_predictor(config) input_names = predictor.get_input_names() if mode in ['ser', 're']: input_tensor = [] for name in input_names: input_tensor.append(predictor.get_input_handle(name)) else: for name in input_names: input_tensor = predictor.get_input_handle(name) output_tensors = get_output_tensors(args, mode, predictor) return predictor, input_tensor, output_tensors, config def get_output_tensors(args, mode, predictor): output_names = predictor.get_output_names() output_tensors = [] if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]: output_name = 'softmax_0.tmp_0' if output_name in output_names: return [predictor.get_output_handle(output_name)] else: for output_name in output_names: output_tensor = predictor.get_output_handle(output_name) output_tensors.append(output_tensor) else: for output_name in output_names: output_tensor = predictor.get_output_handle(output_name) output_tensors.append(output_tensor) return output_tensors def get_infer_gpuid(): sysstr = platform.system() if sysstr == "Windows": return 0 if not paddle.fluid.core.is_compiled_with_rocm(): cmd = "env | grep CUDA_VISIBLE_DEVICES" else: cmd = "env | grep HIP_VISIBLE_DEVICES" env_cuda = os.popen(cmd).readlines() if len(env_cuda) == 0: return 0 else: gpu_id = env_cuda[0].strip().split("=")[1] return int(gpu_id[0]) def draw_e2e_res(dt_boxes, strs, img_path): src_im = cv2.imread(img_path) for box, str in zip(dt_boxes, strs): box = box.astype(np.int32).reshape((-1, 1, 2)) cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) cv2.putText( src_im, str, org=(int(box[0, 0, 0]), int(box[0, 0, 1])), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.7, color=(0, 255, 0), thickness=1) return src_im def draw_text_det_res(dt_boxes, img): for box in dt_boxes: box = np.array(box).astype(np.int32).reshape(-1, 2) cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2) return img def resize_img(img, input_size=600): """ resize img and limit the longest side of the image to input_size """ img = np.array(img) im_shape = img.shape im_size_max = np.max(im_shape[0:2]) im_scale = float(input_size) / float(im_size_max) img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) return img def draw_ocr(image, boxes, txts=None, scores=None, drop_score=0.5, font_path="./doc/fonts/simfang.ttf"): """ Visualize the results of OCR detection and recognition args: image(Image|array): RGB image boxes(list): boxes with shape(N, 4, 2) txts(list): the texts scores(list): txxs corresponding scores drop_score(float): only scores greater than drop_threshold will be visualized font_path: the path of font which is used to draw text return(array): the visualized img """ if scores is None: scores = [1] * len(boxes) box_num = len(boxes) for i in range(box_num): if scores is not None and (scores[i] < drop_score or math.isnan(scores[i])): continue box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) if txts is not None: img = np.array(resize_img(image, input_size=600)) txt_img = text_visual( txts, scores, img_h=img.shape[0], img_w=600, threshold=drop_score, font_path=font_path) img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) return img return image def draw_ocr_box_txt(image, boxes, txts=None, scores=None, drop_score=0.5, font_path="./doc/fonts/simfang.ttf"): h, w = image.height, image.width img_left = image.copy() img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 random.seed(0) draw_left = ImageDraw.Draw(img_left) if txts is None or len(txts) != len(boxes): txts = [None] * len(boxes) for idx, (box, txt) in enumerate(zip(boxes, txts)): if scores is not None and scores[idx] < drop_score: continue color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) draw_left.polygon(box, fill=color) img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) pts = np.array(box, np.int32).reshape((-1, 1, 2)) cv2.polylines(img_right_text, [pts], True, color, 1) img_right = cv2.bitwise_and(img_right, img_right_text) img_left = Image.blend(image, img_left, 0.5) img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) img_show.paste(img_left, (0, 0, w, h)) img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) return np.array(img_show) def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): box_height = int( math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) box_width = int( math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) if box_height > 2 * box_width and box_height > 30: img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) draw_text = ImageDraw.Draw(img_text) if txt: font = create_font(txt, (box_height, box_width), font_path) draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) img_text = img_text.transpose(Image.ROTATE_270) else: img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) draw_text = ImageDraw.Draw(img_text) if txt: font = create_font(txt, (box_width, box_height), font_path) draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) pts1 = np.float32( [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) pts2 = np.array(box, dtype=np.float32) M = cv2.getPerspectiveTransform(pts1, pts2) img_text = np.array(img_text, dtype=np.uint8) img_right_text = cv2.warpPerspective( img_text, M, img_size, flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=(255, 255, 255)) return img_right_text def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): font_size = int(sz[1] * 0.99) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") length = font.getsize(txt)[0] if length > sz[0]: font_size = int(font_size * sz[0] / length) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") return font def str_count(s): """ Count the number of Chinese characters, a single English character and a single number equal to half the length of Chinese characters. args: s(string): the input of string return(int): the number of Chinese characters """ import string count_zh = count_pu = 0 s_len = len(s) en_dg_count = 0 for c in s: if c in string.ascii_letters or c.isdigit() or c.isspace(): en_dg_count += 1 elif c.isalpha(): count_zh += 1 else: count_pu += 1 return s_len - math.ceil(en_dg_count / 2) def text_visual(texts, scores, img_h=400, img_w=600, threshold=0., font_path="./doc/simfang.ttf"): """ create new blank img and draw txt on it args: texts(list): the text will be draw scores(list|None): corresponding score of each txt img_h(int): the height of blank img img_w(int): the width of blank img font_path: the path of font which is used to draw text return(array): """ if scores is not None: assert len(texts) == len( scores), "The number of txts and corresponding scores must match" def create_blank_img(): blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 blank_img[:, img_w - 1:] = 0 blank_img = Image.fromarray(blank_img).convert("RGB") draw_txt = ImageDraw.Draw(blank_img) return blank_img, draw_txt blank_img, draw_txt = create_blank_img() font_size = 20 txt_color = (0, 0, 0) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") gap = font_size + 5 txt_img_list = [] count, index = 1, 0 for idx, txt in enumerate(texts): index += 1 if scores[idx] < threshold or math.isnan(scores[idx]): index -= 1 continue first_line = True while str_count(txt) >= img_w // font_size - 4: tmp = txt txt = tmp[:img_w // font_size - 4] if first_line: new_txt = str(index) + ': ' + txt first_line = False else: new_txt = ' ' + txt draw_txt.text((0, gap * count), new_txt, txt_color, font=font) txt = tmp[img_w // font_size - 4:] if count >= img_h // gap - 1: txt_img_list.append(np.array(blank_img)) blank_img, draw_txt = create_blank_img() count = 0 count += 1 if first_line: new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) else: new_txt = " " + txt + " " + '%.3f' % (scores[idx]) draw_txt.text((0, gap * count), new_txt, txt_color, font=font) # whether add new blank img or not if count >= img_h // gap - 1 and idx + 1 < len(texts): txt_img_list.append(np.array(blank_img)) blank_img, draw_txt = create_blank_img() count = 0 count += 1 txt_img_list.append(np.array(blank_img)) if len(txt_img_list) == 1: blank_img = np.array(txt_img_list[0]) else: blank_img = np.concatenate(txt_img_list, axis=1) return np.array(blank_img) def base64_to_cv2(b64str): import base64 data = base64.b64decode(b64str.encode('utf8')) data = np.frombuffer(data, np.uint8) data = cv2.imdecode(data, cv2.IMREAD_COLOR) return data def draw_boxes(image, boxes, scores=None, drop_score=0.5): if scores is None: scores = [1] * len(boxes) for (box, score) in zip(boxes, scores): if score < drop_score: continue box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) return image def get_rotate_crop_image(img, points): ''' img_height, img_width = img.shape[0:2] left = int(np.min(points[:, 0])) right = int(np.max(points[:, 0])) top = int(np.min(points[:, 1])) bottom = int(np.max(points[:, 1])) img_crop = img[top:bottom, left:right, :].copy() points[:, 0] = points[:, 0] - left points[:, 1] = points[:, 1] - top ''' assert len(points) == 4, "shape of points must be 4*2" img_crop_width = int( max( np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3]))) img_crop_height = int( max( np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2]))) pts_std = np.float32([[0, 0], [img_crop_width, 0], [img_crop_width, img_crop_height], [0, img_crop_height]]) M = cv2.getPerspectiveTransform(points, pts_std) dst_img = cv2.warpPerspective( img, M, (img_crop_width, img_crop_height), borderMode=cv2.BORDER_REPLICATE, flags=cv2.INTER_CUBIC) dst_img_height, dst_img_width = dst_img.shape[0:2] if dst_img_height * 1.0 / dst_img_width >= 1.5: dst_img = np.rot90(dst_img) return dst_img def get_minarea_rect_crop(img, points): bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) index_a, index_b, index_c, index_d = 0, 1, 2, 3 if points[1][1] > points[0][1]: index_a = 0 index_d = 1 else: index_a = 1 index_d = 0 if points[3][1] > points[2][1]: index_b = 2 index_c = 3 else: index_b = 3 index_c = 2 box = [points[index_a], points[index_b], points[index_c], points[index_d]] crop_img = get_rotate_crop_image(img, np.array(box)) return crop_img def check_gpu(use_gpu): if use_gpu and not paddle.is_compiled_with_cuda(): use_gpu = False return use_gpu if __name__ == '__main__': pass