Browse Source

Commit PaddleOCR dygraph branch 6cbd7d1ecef832d428e21ef98c44382c5384d8f7

yangjun 1 year ago
parent
commit
5709eb762c
69 changed files with 404 additions and 719 deletions
  1. + 1 - 2  README.md
  2. + 1 - 4  README_ch.md
  3. + 28 - 30  applications/README.md
  4. + 12 - 12  applications/快速构建卡证类OCR.md
  5. + 0 - 268  applications/蒙古文书籍文字识别.md
  6. + 1 - 0  configs/det/det_res18_db_v2.0.yml
  7. + 1 - 0  configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml
  8. + 1 - 0  configs/table/table_mv3.yml
  9. + 7 - 8  deploy/avh/convert_image.py
  10. + 1 - 1  deploy/cpp_infer/readme.md
  11. + 1 - 1  deploy/cpp_infer/src/preprocess_op.cpp
  12. + 1 - 1  deploy/cpp_infer/src/utility.cpp
  13. + 1 - 1  deploy/hubserving/ocr_det/module.py
  14. + 1 - 1  deploy/hubserving/ocr_system/module.py
  15. + 1 - 5  deploy/paddlejs/README.md
  16. + 1 - 5  deploy/paddlejs/README_ch.md
  17. + 1 - 0  deploy/slim/quantization/README.md
  18. + 0 - 2  doc/doc_ch/PP-OCRv3_introduction.md
  19. + 0 - 2  doc/doc_ch/PP-OCRv3_det_train.md
  20. + 0 - 2  doc/doc_ch/algorithm_det_east.md
  21. + 2 - 2  doc/doc_ch/algorithm_det_sast.md
  22. + 64 - 0  doc/doc_ch/finetune.md
  23. + 2 - 2  doc/doc_ch/inference_args.md
  24. + 2 - 2  doc/doc_ch/inference_ppocr.md
  25. + 1 - 2  doc/doc_ch/quickstart.md
  26. + 11 - 6  doc/doc_ch/table_recognition.md
  27. + 0 - 253  doc/doc_en/PP-OCRv3_det_train_en.md
  28. + 0 - 1  doc/doc_en/PP-OCRv3_introduction_en.md
  29. + 1 - 1  doc/doc_en/algorithm_det_db_en.md
  30. + 2 - 2  doc/doc_en/algorithm_det_sast_en.md
  31. + 1 - 1  doc/doc_en/algorithm_overview_en.md
  32. + 1 - 1  doc/doc_en/algorithm_rec_vitstr_en.md
  33. + 62 - 0  doc/doc_en/finetune_en.md
  34. + 2 - 2  doc/doc_en/inference_args_en.md
  35. + 23 - 25  doc/doc_en/inference_en.md
  36. + 1 - 0  doc/doc_en/knowledge_distillation_en.md
  37. + 1 - 1  doc/doc_en/models_list_en.md
  38. + 5 - 0  doc/doc_en/table_recognition_en.md
  39. + 1 - 1  doc/doc_i18n/README_日本語.md
  40. + 22 - 1  paddleocr.py
  41. + 7 - 6  ppocr/data/imaug/abinet_aug.py
  42. + 4 - 1  ppocr/data/imaug/ct_process.py
  43. + 3 - 1  ppocr/data/imaug/drrg_targets.py
  44. + 1 - 1  ppocr/data/imaug/fce_aug.py
  45. + 10 - 7  ppocr/data/imaug/fce_targets.py
  46. + 3 - 3  ppocr/data/imaug/label_ops.py
  47. + 3 - 11  ppocr/modeling/heads/det_db_head.py
  48. + 1 - 1  ppocr/modeling/heads/proposal_local_graph.py
  49. + 5 - 0  ppocr/modeling/heads/rec_nrtr_head.py
  50. + 3 - 4  ppocr/postprocess/drrg_postprocess.py
  51. + 3 - 2  ppocr/postprocess/east_postprocess.py
  52. + 2 - 2  ppocr/postprocess/fce_postprocess.py
  53. + 1 - 1  ppocr/postprocess/rec_postprocess.py
  54. + 2 - 0  ppocr/postprocess/sast_postprocess.py
  55. + 5 - 1  ppocr/utils/e2e_metric/Deteval.py
  56. + 1 - 1  ppocr/utils/gen_label.py
  57. + 23 - 0  ppocr/utils/utility.py
  58. + 13 - 0  ppstructure/docs/quickstart.md
  59. + 7 - 0  ppstructure/kie/README.md
  60. + 7 - 0  ppstructure/kie/README_ch.md
  61. + 1 - 1  ppstructure/kie/requirements.txt
  62. + 1 - 1  ppstructure/table/predict_table.py
  63. + 3 - 5  requirements.txt
  64. + 4 - 2  test_tipc/prepare.sh
  65. + 13 - 13  test_tipc/supplementary/train.py
  66. + 5 - 2  test_tipc/test_serving_infer_cpp.sh
  67. + 3 - 1  tools/infer/predict_det.py
  68. + 3 - 2  tools/infer/predict_rec.py
  69. + 3 - 2  tools/train.py

+ 1 - 2
README.md

@@ -27,8 +27,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
 
 ## 📣 Recent updates
 - 🔨**2022.11 Add implementation of [4 cutting-edge algorithms](doc/doc_ch/algorithm_overview.md)**: Text Detection [DRRG](doc/doc_en/algorithm_det_drrg_en.md), Text Recognition [RFL](./doc/doc_en/algorithm_rec_rfl_en.md), Image Super-Resolution [Text Telescope](doc/doc_en/algorithm_sr_telescope_en.md), Handwritten Mathematical Expression Recognition [CAN](doc/doc_en/algorithm_rec_can_en.md)
-- **2022.10 Release [optimized JS version PP-OCRv3 model](./deploy/paddlejs/README.md)** with 4.3M model size, 8x faster inference time, and a ready-to-use web demo
-
+- **2022.10 release [optimized JS version PP-OCRv3 model](./deploy/paddlejs/README.md)** with 4.3M model size, 8x faster inference time, and a ready-to-use web demo
 - 💥 **Live Playback: Introduction to PP-StructureV2 optimization strategy**. Scan [the QR code below](#Community) using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.)
 
 

File diff suppressed because it is too large
+ 1 - 4
README_ch.md


File diff suppressed because it is too large
+ 28 - 30
applications/README.md


+ 12 - 12
applications/快速构建卡证类OCR.md

@@ -223,7 +223,7 @@ AIStudio project link: [快速构建卡证类OCR](https://aistudio.baidu.com/ais
 
2) Download and extract the pretrained model; to use a different model, pick a suitable one from the model zoo.
 ```
-!wget -P work/pre_trained/   https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar 
+!wget -P work/pre_trained/   https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
 !tar -vxf /home/aistudio/work/pre_trained/ch_PP-OCRv3_det_distill_train.tar -C /home/aistudio/work/pre_trained
 ```
3) Install the required dependencies
@@ -275,7 +275,7 @@ AIStudio project link: [快速构建卡证类OCR](https://aistudio.baidu.com/ais
 
 ```python
 class DetLabelEncode(object):
-   
+
    # Modified the detection-label encoding: added the class-count parameter num_classes, overrode the initializer, and added reading of the class labels
 
     def __init__(self, label_list, num_classes=8, **kwargs):
@@ -315,11 +315,11 @@ class DetLabelEncode(object):
                     classes.append(int(self.label_list.index(txt)))
 
         if len(boxes) == 0:
-            
+
             return None
         boxes = self.expand_points_num(boxes)
         boxes = np.array(boxes, dtype=np.float32)
-        txt_tags = np.array(txt_tags, dtype=np.bool)
+        txt_tags = np.array(txt_tags, dtype=np.bool_)
         classes = classes
         data['polys'] = boxes
         data['texts'] = txts
@@ -410,10 +410,10 @@ class MakeShrinkMap(object):
 
 
         data['shrink_map'] = gt
-       
+
         if self.num_classes > 1:
             data['class_mask'] = gt_class
-        
+
         data['shrink_mask'] = mask
         return data
 ```
@@ -634,10 +634,10 @@ class DBPostProcess(object):
         '''
         h, w = bitmap.shape[:2]
         box = _box.copy()
-        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1)
-        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1)
-        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1)
-        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1)
+        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
+        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
+        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
+        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)
 
         mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
         box[:, 0] = box[:, 0] - xmin
@@ -752,11 +752,11 @@ class DBPostProcess(object):
Other commands:
 ```
 !python /home/aistudio/work/PaddleOCR/tools/eval.py  -c  /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml
-!python /home/aistudio/work/PaddleOCR/tools/infer_det.py  -c  /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml 
+!python /home/aistudio/work/PaddleOCR/tools/infer_det.py  -c  /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml
 ```
Model inference
 ```
-!python /home/aistudio/work/PaddleOCR/tools/infer/predict_det.py --image_dir="/home/aistudio/work/test_img/" --det_model_dir="/home/aistudio/work/PaddleOCR/output/infer" 
+!python /home/aistudio/work/PaddleOCR/tools/infer/predict_det.py --image_dir="/home/aistudio/work/test_img/" --det_model_dir="/home/aistudio/work/PaddleOCR/output/infer"
 ```
 
## 5 Summary

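Reviewer note: the `np.bool`/`np.int` edits above (and the matching ones in `ppocr/` later in this commit) track NumPy's removal of its deprecated scalar aliases: `np.bool`, `np.int`, and `np.float` were deprecated in NumPy 1.20 and removed in 1.24, so the old spellings now raise `AttributeError`. A minimal sketch of the substitutions:

```python
import numpy as np

# Deprecated aliases -> concrete dtypes (NumPy >= 1.24 rejects the aliases):
txt_tags = np.array([True, False], dtype=np.bool_)      # was: dtype=np.bool
xmin = np.clip(np.floor(3.7), 0, 10).astype(np.int32)   # was: .astype(np.int)
output_data = np.zeros([7760], np.float32)              # was: np.float
```
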
+ 0 - 268
applications/蒙古文书籍文字识别.md

@@ -1,268 +0,0 @@
-# Mongolian Book Text Recognition
-
-This example uses OCR to detect and recognize the text of Mongolian books. In this chapter you will learn:
-
-- The basics of the Mongolian script
-- How to build a Mongolian dictionary and synthesize data
-- How to fine-tune the recognition model
-
-This example was built jointly by PaddlePaddle and Inner Mongolia Altai Electronic Information Technology Co., Ltd. (CTO Ourileke, with Suritu, Dabuxilatu, and Xinqile)
-
-## Background
-
-Mongolian text recognition has become a pressing problem in Mongolian information processing, but for many subjective and objective reasons, productized Mongolian text recognition is still some way off. Book digitization is one of the important topics, yet it involves many techniques and is quite difficult, so Mongolian book text recognition as a whole is not yet mature.
-
-![pic_script](https://user-images.githubusercontent.com/50011306/206182800-a3029f08-dece-4cbe-9298-a66eb413137a.jpeg)
-
-*Basic concepts
-```txt
-Letter: i.e. character. Unlike a Chinese character (one shape per character, carrying meaning),
-a Mongolian letter is, like an English letter, part of a phonetic script, but each letter has
-several glyphs; several letters form a word.
-
-Nominal character: the Unicode principles state that "Character is not Glyph". For a script
-with positional shaping such as Mongolian, one glyph therefore represents the other forms.
-That glyph is recorded in the Unicode base table and assigned a code point. For example,
-Mongolian ᠠ ([a]) is the isolated form of a, assigned code point \u1820 and recorded as
-uni1820.isol; it stands for all the word-initial uni1820.init, medial uni1820.medi, and
-final 1820.fina forms.
-
-Presentation form: any glyph other than the nominal character, without a code point of its
-own. The characters seen in Mongolian text are all rendered by a Unicode Scripts Processor
-(USP for short).
-
-Word: the smallest unit of the language that can be used independently, equivalent to an
-English word; a string delimited by spaces.
-```
-
-### Challenges
-
-- 1. Complexity of Mongolian characters
-
-  One glyph, several letters
-
-  ![pic2](https://user-images.githubusercontent.com/50011306/206182327-b4a888a6-e67e-4d87-992d-0ddb830de85f.png)
-
-  Several letters, one glyph
-
-  ![pic3](https://user-images.githubusercontent.com/50011306/206182335-25b8c657-98da-4532-ae7d-608608a8f047.png)
-
-- 2. Mongolian layout (vertical writing, lines wrapping left to right)
-
-The writing direction and line-wrapping direction affect the ordering and orientation of the recognized text.
-
-  ![pic4](https://user-images.githubusercontent.com/50011306/206182347-c5e2525a-f1fd-4ee6-936c-946435b3fe6b.png)
-
-- 3. Inconsistent letter widths and line widths
-
-
-  ![pic5](https://user-images.githubusercontent.com/50011306/206182391-431c2441-1d1d-4f25-931c-b0f663bf3285.png)
-
-
-- 4. Components within characters are hard to distinguish (handling of the MVS and NNBSP dots, and ligature glyphs)
-
-Mongolian has control characters governing shape changes; the most frequent are the Mongolian Vowel Separator
-(MVS, \u180E) and the Narrow No-Break Space (NNBSP, \u202F). Both are normally transparent
-in text, narrower than a space, and non-breaking. MVS connects a stem with the final vowels
-a and e, while NNBSP connects a word with its suffixes (separately written suffixes). MVS
-triggers shape changes on both sides, whereas NNBSP changes the shape of what follows.
-
-In addition, some Mongolian glyphs are components of other glyphs, which makes recognition results hard to match.
-
-
-To address these problems, this example develops a Mongolian text recognition system based on PP-OCRv3, an open-source ultra-lightweight OCR system. We first synthesized 2.5 million samples with a data synthesis tool and trained on them; by precisely trimming white margins, randomly inserting punctuation, and adjusting the evaluation set, recognition accuracy was raised to 75.78%. Results:
-
-| Strategy | Accuracy % |
-| :--------------- | :-------- |
-| Training on synthetic data | 42.38 |
-| Optimized synthetic training | 75.78 |
-
-The workflow:
-- Step 1: select and proofread a real corpus, and convert a copy of the corpus to a non-Unicode encoding
-- Step 2: generate with a variety of fonts, line by line; non-Unicode fonts are recommended for generation
-- Step 3: cut real scanned text pages into lines, save them, and make sure every image is labeled with Unicode text.
-The evaluation set consists entirely of real image samples.
-- Step 4: start training
-- Step 5: recognize the text
-
-The quality of the evaluation data also plays a large role in training efficiency.
-
-## Quick start
-### Environment setup
-
-This task was completed on AIStudio, with the following environment:
-
-- OS: Linux
-- PaddlePaddle: 2.3
-- PaddleOCR: Release/2.5
-- text_renderer: master
-
-Download the PaddleOCR code and install the dependencies:
-```bash
-git clone -b dygraph https://gitee.com/paddlepaddle/PaddleOCR
-
-# install dependencies
-cd PaddleOCR
-pip install -r PaddleOCR/requirements.txt
-```
-
-### Model inference
-Place the downloaded or trained model under `PaddleOCR/output`, then run model inference quickly with:
-```bash
-python tools/infer_rec.py -c configs/rec/PP-OCRv3/multi_language/Mongolian_PP-OCRv3_rec.yml \
-                          -o Global.pretrained_model=output/v3_Mongolian_mobile/best_accuracy \
-                          Global.infer_img=doc/imgs_words/
-```
-
-<!-- #region -->
-## Data preparation
-
-This project generated 2.5 million images from a real corpus as the training set, and another 10 thousand images as the validation set.
-
-### Corpus preparation
-Because of its encoding, Mongolian images cannot be generated directly from Unicode strings. Mongolian Unicode is essentially phonetic and involves complex shaping; without a matching complex-text processor, only the nominal characters of Mongolian can be displayed, not the presentation forms.
-To generate Mongolian images you therefore need to either: 1. call the Windows USP10.dll, or 2. generate with a glyph-encoded font.
-This project uses the second approach: images are generated with glyph-encoded fonts and matched with Unicode labels.
-
-Generating directly from Unicode (the letters come apart and appear as nominal characters):
-
-![pic8](https://user-images.githubusercontent.com/50011306/206183135-d8be1ff7-4e3b-404f-bf5c-c0b47d5d4718.png)
-
-$$\mbox{Left column: images generated from Unicode; right column: Unicode text}$$
-
-![pic9](https://user-images.githubusercontent.com/50011306/206183154-3aec2415-66fb-41b8-872d-49aad4b62113.png)
-
-$$\mbox{Left column: Unicode text; right column: images generated from a glyph-encoded font}$$
-
-When generating images, keep strings to about 5 words (about 30 letters on average); otherwise training becomes difficult.
-
-### Image processing
-
-Some of the training images:
-
-![pic6](https://user-images.githubusercontent.com/50011306/206182740-d7e38be8-e857-45a4-8639-2a8656c9f8e5.png)
-
-To verify the model's behavior on real images, the validation images are real scanned images. After scanning the complete pages,
-
-The label file format is as follows:
-
-<img src='https://ai-studio-static-online.cdn.bcebos.com/c7d98953fba24ed28a8f4e189b9d7cf81babdacc3fc3465b9cb65d09691dd4c8' width='800'>
-
-
-|Dataset|Size|
-|---|---|
-|Training set| 2.5M|
-|Validation set| 11K|
-<!-- #endregion -->
-
-<!-- #region -->
-The data files are organized as follows:
-
-```txt
-PaddleOCRv3
-├── train_data               # training data folder
-│   ├── texts
-│   │   ├── train1.txt       # generated training labels, one file per image folder
-│   │   ├── train2.txt
-│   │   ├── train3.txt
-│   │   ├── train4.txt
-│   │   ├── train11.txt
-│   │   ├── train20.txt
-│   │   ├── train21.txt
-│   │   └── train22.txt
-│   ├── image1               # generated training images
-│   ├── image2
-│   ├── image3
-│   ├── image4
-│   ├── image11
-│   ├── image20
-│   ├── image21
-│   └── image22
-├── test_data                # validation data folder
-│   ├── test_data.txt        # validation labels
-│   ├── 0                    # each folder holds 34 images
-│   ├── 1
-:   :
-:   :
-│   └── 409
-```
-### Building the dictionary
-
-Build a text dictionary containing all Mongolian characters, ordered by Unicode code point; it is recommended to save it under ./ppocr/utils/dict and change the path in the yml file.
-
-<img src='https://ai-studio-static-online.cdn.bcebos.com/825976d0134c4b94a07ca2c8249d8d53f6f5834453cd4fb093d9fa8bc644cd4f' width='200'>
-
-
-## Training on synthetic data
-### Model training and evaluation
-
-With the synthetic data prepared, training can be started with:
-<!-- #endregion -->
-```bash
-cd ${PaddleOCR_root}
-python tools/train.py -c configs/rec/PP-OCRv3/multi_language/Mongolian_PP-OCRv3_rec.yml
-```
-To resume training from a checkpoint:
-```bash
-cd ${PaddleOCR_root}
-python tools/train.py -c configs/rec/PP-OCRv3/multi_language/Mongolian_PP-OCRv3_rec.yml \
-                      -o Global.checkpoints=./output/v3_Mongolian_mobile/best_accuracy
-```
-The available parameters are:
-
-```txt
--c: the configuration file to use; Mongolian_PP-OCRv3_rec.yml corresponds to the OCRv3 recognition model.
--o: override parameters in the configuration file
-Global.pretrained_model: the pretrained model to use
-Global.checkpoints: checkpoint location
-Global.epoch_num: number of training epochs
-Global.eval_batch_step: evaluation interval, in steps
-Train.dataset.data_dir: training dataset path
-Train.dataset.label_file_list: training label file list
-Train.loader.batch_size_per_card: per-card training batch size
-Eval.dataset.data_dir: evaluation dataset path
-Eval.dataset.label_file_list: evaluation label file list
-Eval.loader.batch_size_per_card: per-card evaluation batch size
-```
-
-### Model inference
-Run inference with the trained model as follows:
-```bash
-python tools/infer_rec.py -c configs/rec/PP-OCRv3/multi_language/Mongolian_PP-OCRv3_rec.yml \
-                          -o Global.pretrained_model=output/v3_Mongolian_mobile/best_accuracy \
-                          Global.infer_img=doc/imgs_words/
-```
-## Testing the model on real data
-
-After training, the model can be tested. Point the test images to a folder:
-```shell
-PaddleOCRv3
-├── doc
-├── imgs_words
-│   ├── arabic
-│   ├── belarusian
-│   ├── bulgarian
-:   :
-:   :
-│   ├── mongolian      # put real Mongolian images here, one line of text per image
-│   └── uyghur
-```
-Quick evaluation:
-
-```bash
-python tools/eval.py -c configs/rec/PP-OCRv3/multi_language/Mongolian_PP-OCRv3_rec.yml \
-                     -o Global.checkpoints=./output/v3_Mongolian_mobile/best_accuracy
-```
-The predictions are recorded in the predicts_ppocrv3_Mongolian.txt file.
-
-```shell
-PaddleOCRv3
-├── output
-│   ├── rec
-│   │   └── predicts_ppocrv3_Mongolian.txt
-│   └── v3_Mongolian_mobile
-```
-
-Partial results: the three columns are the prediction, the ground-truth label, and the image
-
-![pic7](https://user-images.githubusercontent.com/50011306/206182924-57472dc7-fd74-4872-8466-15c05eeb369d.png)
-
-
-## Summary
-
-This example develops a Mongolian text recognition system based on PP-OCRv3, an open-source ultra-lightweight OCR system. With 2.5 million synthetic samples added and the existing model fine-tuned, revising the training set and setting an evaluation standard raised Mongolian recognition accuracy from 42% to 75%.
+ 1 - 0
configs/det/det_res18_db_v2.0.yml

@@ -22,6 +22,7 @@ Architecture:
   Backbone:
     name: ResNet_vd
     layers: 18
+    disable_se: True
   Neck:
     name: DBFPN
     out_channels: 256

+ 1 - 0
configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml

@@ -19,6 +19,7 @@ Global:
   use_space_char: true
   distributed: true
   save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt
+  amp_custom_black_list: ['matmul','matmul_v2','elementwise_add']
 
 
 Optimizer:

+ 1 - 0
configs/table/table_mv3.yml

@@ -20,6 +20,7 @@ Global:
   max_text_length: &max_text_length 500
   box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy'
   infer_mode: False
+  amp_custom_black_list: ['matmul_v2','elementwise_add']
 
 Optimizer:
   name: Adam

+ 7 - 8
deploy/avh/convert_image.py

@@ -24,6 +24,7 @@ import math
 from PIL import Image
 import numpy as np
 
+
 def resize_norm_img(img, image_shape, padding=True):
     imgC, imgH, imgW = image_shape
     h = img.shape[0]
@@ -61,9 +62,8 @@ def create_header_file(name, tensor_name, tensor_data, output_path):
     raw_path = file_path.with_suffix(".h").resolve()
     with open(raw_path, "w") as header_file:
         header_file.write(
-            "\n"
-            + f"const size_t {tensor_name}_len = {tensor_data.size};\n"
-            + f'__attribute__((section(".data.tvm"), aligned(16))) float {tensor_name}[] = '
+            "\n" + f"const size_t {tensor_name}_len = {tensor_data.size};\n" +
+            f'__attribute__((section(".data.tvm"), aligned(16))) float {tensor_name}[] = '
         )
 
         header_file.write("{")
@@ -80,22 +80,21 @@ def create_headers(image_name):
 
     # Resize image to 32x320
     img = cv2.imread(img_path)
-    img = resize_norm_img(img, [3,32,320])
+    img = resize_norm_img(img, [3, 32, 320])
     img_data = img.astype("float32")
-    
+
     # # Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
     img_data = np.expand_dims(img_data, axis=0)
 
     # Create input header file
     create_header_file("inputs", "input", img_data, "./include")
     # Create output header file
-    output_data = np.zeros([7760], np.float)
+    output_data = np.zeros([7760], np.float32)
     create_header_file(
         "outputs",
         "output",
         output_data,
-        "./include",
-    )
+        "./include", )
 
 
 if __name__ == "__main__":

+ 1 - 1
deploy/cpp_infer/readme.md

@@ -158,7 +158,7 @@ build/paddle_inference_install_dir/
 <a name="21"></a>
 ### 2.1 Export the inference model
 
-* You can refer to [Model inference](../../doc/doc_en/inference_en.md) and export the inference model. After the model is exported, assuming it is placed in the `inference` directory, the directory structure is as follows.
+* You can refer to [Model inference](../../doc/doc_ch/inference.md) and export the inference model. After the model is exported, assuming it is placed in the `inference` directory, the directory structure is as follows.
 
 ```
 inference/

+ 1 - 1
deploy/cpp_infer/src/preprocess_op.cpp

@@ -112,7 +112,7 @@ void CrnnResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio,
              cv::INTER_LINEAR);
   cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0,
                      int(imgW - resize_img.cols), cv::BORDER_CONSTANT,
-                     {127, 127, 127});
+                     {0, 0, 0});
 }
 
 void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img,

+ 1 - 1
deploy/cpp_infer/src/utility.cpp

@@ -308,7 +308,7 @@ void Utility::sorted_boxes(std::vector<OCRPredictResult> &ocr_result) {
   std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box);
   if (ocr_result.size() > 0) {
     for (int i = 0; i < ocr_result.size() - 1; i++) {
-      for (int j = i; j > 0; j--) {
+      for (int j = i; j >= 0; j--) {
         if (abs(ocr_result[j + 1].box[0][1] - ocr_result[j].box[0][1]) < 10 &&
             (ocr_result[j + 1].box[0][0] < ocr_result[j].box[0][0])) {
           std::swap(ocr_result[i], ocr_result[i + 1]);

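Reviewer note: the loop-bound change in `sorted_boxes` (`j > 0` to `j >= 0`) lets the backward scan reach index 0, so the first box of a text line also takes part in the same-line comparison. A rough Python sketch of the intent, not a line-for-line port (the C++ code swaps `ocr_result[i]` and `ocr_result[i + 1]` directly):

```python
def sorted_boxes(boxes, y_thresh=10):
    # Each box is a 4x2 quad; box[0] is its top-left corner.
    boxes = sorted(boxes, key=lambda b: (b[0][1], b[0][0]))  # top-to-bottom, then left-to-right
    for i in range(len(boxes) - 1):
        for j in range(i, -1, -1):  # j >= 0: index 0 is included after the fix
            same_line = abs(boxes[j + 1][0][1] - boxes[j][0][1]) < y_thresh
            if same_line and boxes[j + 1][0][0] < boxes[j][0][0]:
                boxes[j], boxes[j + 1] = boxes[j + 1], boxes[j]
    return boxes
```
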
+ 1 - 1
deploy/hubserving/ocr_det/module.py

@@ -122,7 +122,7 @@ class OCRDet(hub.Module):
             rec_res_final = []
             for dno in range(len(dt_boxes)):
                 rec_res_final.append({
-                    'text_region': dt_boxes[dno].astype(np.int).tolist()
+                    'text_region': dt_boxes[dno].astype(np.int32).tolist()
                 })
             all_results.append(rec_res_final)
         return all_results

+ 1 - 1
deploy/hubserving/ocr_system/module.py

@@ -130,7 +130,7 @@ class OCRSystem(hub.Module):
                 rec_res_final.append({
                     'text': text,
                     'confidence': float(score),
-                    'text_region': dt_boxes[dno].astype(np.int).tolist()
+                    'text_region': dt_boxes[dno].astype(np.int32).tolist()
                 })
             all_results.append(rec_res_final)
         return all_results

+ 1 - 5
deploy/paddlejs/README.md

@@ -5,7 +5,7 @@ English| [简体中文](README_ch.md)
 [Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is the web project of Baidu PaddlePaddle, an open-source deep learning framework that runs in the browser. Paddle.js can either load a pre-trained model or transform a model from paddle-hub with the model-transforming tools it provides. It runs in every browser that supports WebGL/WebGPU/WebAssembly, and also in Baidu Smartprogram and WeChat miniprogram.
 
 ## Web Demo
-To run the OCR demo in a browser, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo).
+To run the OCR demo in a browser, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/application/js/WebDemo.md).
 
 |demo|web demo directory|visualization|
 |-|-|-|
@@ -24,7 +24,3 @@ To run the OCR demo in a WeChat miniprogram, refer to the [tutorial](https://github.com/Paddle
 <div align="center">
     <img src="./paddlejs_demo.gif" width="800">
 </div>
-
-<a href="https://trackgit.com">
-<img src="https://us-central1-trackgit-analytics.cloudfunctions.net/token/ping/lb0jygcawaxcrq8cb8rl" alt="trackgit-views" />
-</a>

+ 1 - 5
deploy/paddlejs/README_ch.md

@@ -7,7 +7,7 @@
 
 ## Using the Web Demo
 
-To run the official OCR demo directly in the browser, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo)
+To run the official OCR demo directly in the browser, refer to the [tutorial](https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/application/js/WebDemo.md)
 
 |demo name|web demo directory|visualization|
 |-|-|-|
@@ -29,7 +29,3 @@
 <div align="center">
     <img src="https://user-images.githubusercontent.com/26592129/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif" width="800">
 </div>
-
-<a href="https://trackgit.com">
-<img src="https://us-central1-trackgit-analytics.cloudfunctions.net/token/ping/lb0jzfbyyttdxne1imal" alt="trackgit-views" />
-</a>

+ 1 - 0
deploy/slim/quantization/README.md

@@ -54,6 +54,7 @@ python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP
 ### 5. Deploying the quantized model

 The quantized model exported by the steps above still stores its parameters as FP32, but their numeric range is int8; the exported model can be converted with PaddleLite's opt model conversion tool.
+
 For mobile deployment of the quantized model, refer to [mobile model deployment](../../lite/readme.md)

 Note: the parameters of a quantization-aware-trained model are float32, so converting it to an inference model brings no speedup over the unquantized one, because quantize/dequantize operators sit between the model's layers. To deploy a quantized model, use TensorRT with precision set to INT8 to accelerate its inference.

+ 0 - 2
doc/doc_ch/PP-OCRv3_introduction.md

@@ -63,8 +63,6 @@ The PP-OCRv3 detection model improves on the [CML](https://arxiv.org/pdf/2109.03144.p

 Test environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled during inference.

-For the training steps of the PP-OCRv3 detection model, refer to the [tutorial](./PP-OCRv3_det_train.md)
-
 **(1) LK-PAN: a PAN structure with a large receptive field**

 LK-PAN (Large Kernel PAN) is a lightweight [PAN](https://arxiv.org/pdf/1803.01534.pdf) structure with a larger receptive field; its core change is enlarging the convolution kernels in PAN's path augmentation from `3*3` to `9*9`. Larger kernels widen the receptive field covered by each position of the feature map, making it easier to detect text in large fonts and text with extreme aspect ratios. With LK-PAN, the teacher model's hmean improves from 83.2% to 85.0%.

+ 0 - 2
doc/doc_ch/PP-OCRv3_det_train.md

@@ -1,5 +1,3 @@
-[English](../doc_en/PP-OCRv3_det_train_en.md) | Simplified Chinese
-
 
# PP-OCRv3 Text Detection Model Training
 

+ 0 - 2
doc/doc_ch/algorithm_det_east.md

@@ -30,8 +30,6 @@
 |EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml) | 78.20%|    79.10%|    78.65%|    [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)|
 
 
-
-
 <a name="2"></a>
 ## 2. Environment
 Please refer to [Environment Preparation](./environment.md) to set up the PaddleOCR environment, and to [Project Clone](./clone.md) to clone the project code.

+ 2 - 2
doc/doc_ch/algorithm_det_sast.md

@@ -73,9 +73,9 @@ python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Gl
 
 ```
 
-For SAST text detection model inference, set the parameter `--det_algorithm="SAST"` and also add `--det_sast_polygon=True`; then run:
+For SAST text detection model inference, set the parameter `--det_algorithm="SAST"` and also add `--det_box_type=poly`; then run:
 ```
-python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly'
 ```
 The visualized text detection results are saved to the `./inference_results` folder by default, with result file names prefixed 'det_res'. Example results:
 

+ 64 - 0
doc/doc_ch/finetune.md

@@ -103,6 +103,66 @@ The configuration files provided by PaddleOCR assume 8-card training (a total batch size of `8*
 
 For more PP-OCR series models, refer to the [PP-OCR series model zoo](./models_list.md).
 
+The PP-OCRv3 model uses the GTC strategy, whose SAR branch carries a large number of parameters. When the training data covers only simple scenes, the model easily overfits and fine-tuning works poorly, so removing the GTC strategy is recommended; modify the model-structure part of the configuration file as follows:
+
+```yaml
+Architecture:
+  model_type: rec
+  algorithm: SVTR
+  Transform:
+  Backbone:
+    name: MobileNetV1Enhance
+    scale: 0.5
+    last_conv_stride: [1, 2]
+    last_pool_type: avg
+  Neck:
+    name: SequenceEncoder
+    encoder_type: svtr
+    dims: 64
+    depth: 2
+    hidden_dims: 120
+    use_guide: False
+  Head:
+    name: CTCHead
+    fc_decay: 0.00001
+Loss:
+  name: CTCLoss
+
+Train:
+  dataset:
+  ......
+    transforms:
+    # remove the RecConAug augmentation
+    # - RecConAug:
+    #     prob: 0.5
+    #     ext_data_num: 2
+    #     image_shape: [48, 320, 3]
+    #     max_text_length: *max_text_length
+    - RecAug:
+    # change the Encode method
+    - CTCLabelEncode:
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label
+        - length
+...
+
+Eval:
+  dataset:
+  ...
+    transforms:
+    ...
+    - CTCLabelEncode:
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label
+        - length
+...
+
+
+```
 
### 3.3 Choosing training hyperparameters
 
@@ -136,6 +196,7 @@ Train:
 
 ```
 
+
 In the configuration file above, first set the `pretrained_model` field to the path of the `ch_PP-OCRv3_rec_train/best_accuracy.pdparams` file extracted in section 3.2.
 
 The configuration files provided by PaddleOCR assume 8-card training (a total batch size of `8*128=1024`) without loading a pretrained model, so in your scenario the learning rate should be adjusted linearly with the total batch size, for example:
@@ -162,6 +223,9 @@ Train:
     ratio_list: [1.0, 0.1]
 ```
 
+
### 3.4 Training optimization

Training is not a one-shot process. After completing a stage of training and evaluation, it is recommended to collect and analyze the current model's bad cases in real scenarios, adjust the training data proportions accordingly, or add further synthetic data. Iterating over multiple training rounds keeps improving the model.
+
+If you modify the custom dictionary for training, acc=0 in the early iterations is normal because the parameters of the final FC layer cannot be loaded; don't worry, loading the pretrained model still speeds up convergence.

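Reviewer note: the fine-tuning guide above scales the learning rate linearly with the total batch size. A worked instance of that rule; the base values are illustrative assumptions, not read from a shipped config:

```python
# Linear scaling rule: new_lr = base_lr * new_total_bs / base_total_bs
base_lr = 1e-3           # assumed reference LR for the 8-card setup
base_total_bs = 8 * 128  # 8 cards x batch 128 = 1024, as the docs state
new_total_bs = 1 * 128   # e.g. fine-tuning on a single card
new_lr = base_lr * new_total_bs / base_total_bs  # -> 1.25e-04
```
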
+ 2 - 2
doc/doc_ch/inference_args.md

@@ -70,7 +70,7 @@ The SAST algorithm parameters are as follows
 | :--: | :--: | :--: | :--: |
 |  det_sast_score_thresh | float | 0.5 | Score threshold in SAST postprocessing |
 |  det_sast_nms_thresh | float | 0.5 | NMS threshold in SAST postprocessing |
-|  det_sast_polygon | bool | False | Whether to use polygon detection; set to True for curved-text scenes (e.g. Total-Text) |
+|  det_box_type | str | quad | Whether to use polygon detection; set to 'poly' for curved-text scenes (e.g. Total-Text) |

 The PSE algorithm parameters are as follows

@@ -79,7 +79,7 @@ The PSE algorithm parameters are as follows
 |  det_pse_thresh | float | 0.0 | Threshold for binarizing the output map |
 |  det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes; boxes below it are discarded |
 |  det_pse_min_area | float | 16 | Minimum box area; boxes below it are discarded |
-|  det_pse_box_type | str | "box" | Type of the returned box; box: four corner points, poly: all points of the curved text |
+|  det_box_type | str | "quad" | Type of the returned box; quad: four corner points, poly: all points of the curved text |
 |  det_pse_scale | int | 1 | Ratio of the input image to the post-processing map; e.g. for a `640*640` image the network output is `160*160`, so with scale 2 the post-processing map is `320*320`. Raising this value speeds up post-processing at some cost in accuracy |

 * Text recognition model parameters

File diff suppressed because it is too large
+ 2 - 2
doc/doc_ch/inference_ppocr.md


+ 1 - 2
doc/doc_ch/quickstart.md

@@ -176,14 +176,13 @@ for idx in range(len(result)):
         print(line)
 
 # display the result
-# if simfang.ttf is not present locally, it can be downloaded from the doc/fonts directory
 from PIL import Image
 result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
 scores = [line[1][1] for line in result]
-im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
 im_show = Image.fromarray(im_show)
 im_show.save('result.jpg')
 ```

+ 11 - 6
doc/doc_ch/table_recognition.md

@@ -6,6 +6,7 @@
   - [1.1. Dataset format](#11-数据集格式)
   - [1.2. Data download](#12-数据下载)
   - [1.3. Dataset generation](#13-数据集生成)
+  - [1.4 Data annotation](#14-数据标注)
 - [2. Training](#2-开始训练)
   - [2.1. Start training](#21-启动训练)
   - [2.2. Resume training](#22-断点训练)
@@ -39,15 +40,15 @@ img_label
 The JSON format of each line is:
 ```txt
 {
-   'filename': PMC5755158_010_01.png,							# image name
-   'split': ’train‘, 									# whether the image belongs to the training or validation set
-   'imgid': 0,								 		# index of the image
+   'filename': PMC5755158_010_01.png,                            # image name
+   'split': ’train‘,                                     # whether the image belongs to the training or validation set
+   'imgid': 0,                                         # index of the image
    'html': {
-     'structure': {'tokens': ['<thead>', '<tr>', '<td>', ...]}, 			# HTML string of the table
+     'structure': {'tokens': ['<thead>', '<tr>', '<td>', ...]},             # HTML string of the table
     'cells': [
       {
-         'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], 	# a single text in the table
-         'bbox': [x0, y0, x1, y1]  							# coordinates of the single text in the table
+         'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'],     # a single text in the table
+         'bbox': [x0, y0, x1, y1]                              # coordinates of the single text in the table
        }
      ]
    }
@@ -78,6 +79,10 @@ TableGeneration is an open-source table dataset generation tool that uses a browser
 |Simple table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)|
 |Color table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)|

+## 1.4 Data annotation
+
+For data annotation, refer to [PPOCRLabel](../../PPOCRLabel/README_ch.md)
+
 # 2. Training

 PaddleOCR provides training, evaluation, and prediction scripts. This section takes training the [SLANet](../../configs/table/SLANet.yml) model on the English PubTabNet dataset as an example:

File diff suppressed because it is too large
+ 0 - 253
doc/doc_en/PP-OCRv3_det_train_en.md


+ 0 - 1
doc/doc_en/PP-OCRv3_introduction_en.md

@@ -65,7 +65,6 @@ The ablation experiments are as follows:
 
 Testing environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled during inference.
 
-The training steps of PP-OCRv3 detection model refer to [tutorial](./PP-OCRv3_det_train_en.md)
 
 **(1) LK-PAN: A PAN structure with large receptive field**
 

+ 1 - 1
doc/doc_en/algorithm_det_db_en.md

@@ -31,7 +31,7 @@ On the ICDAR2015 dataset, the text detection result is as follows:
 | --- | --- | --- | --- | --- | --- | --- |
 |DB|ResNet50_vd|[configs/det/det_r50_vd_db.yml](../../configs/det/det_r50_vd_db.yml)|86.41%|78.72%|82.38%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)|
 |DB|MobileNetV3|[configs/det/det_mv3_db.yml](../../configs/det/det_mv3_db.yml)|77.29%|73.08%|75.12%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)|
-|DB++|ResNet50|[configs/det/det_r50_db++_icdar15.yml](../../configs/det/det_r50_db++_icdar15.yml)|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)|
+|DB++|ResNet50|[configs/det/det_r50_db++_ic15.yml](../../configs/det/det_r50_db++_ic15.yml)|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)|
 
 On the TD_TR dataset, the text detection result is as follows:
 

+ 2 - 2
doc/doc_en/algorithm_det_sast_en.md

@@ -74,10 +74,10 @@ First, convert the model saved in the SAST text detection training process into
 python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy  Global.save_inference_dir=./inference/det_sast_tt
 ```
 
-For SAST curved text detection model inference, you need to set the parameters `--det_algorithm="SAST"` and `--det_sast_polygon=True`, then run the following command:
+For SAST curved text detection model inference, you need to set the parameters `--det_algorithm="SAST"` and `--det_box_type=poly`, then run the following command:
 
 ```
-python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly'
 ```
 
 The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows:

+ 1 - 1
doc/doc_en/algorithm_overview_en.md

@@ -99,7 +99,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
 |SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
 |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
 |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
-|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar) |
+|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) |
 |ABINet|Resnet45| 90.75% | rec_r45_abinet | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) |
 |VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar) |
 |SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) |

+ 1 - 1
doc/doc_en/algorithm_rec_vitstr_en.md

@@ -25,7 +25,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
 
 |Model|Backbone|config|Acc|Download link|
 | --- | --- | --- | --- | --- |
-|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar)|
+|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)|
 
 <a name="2"></a>
 ## 2. Environment

+ 62 - 0
doc/doc_en/finetune_en.md

@@ -103,6 +103,66 @@ It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3
 
 For more PP-OCR series models, please refer to [PP-OCR Series Model Library](./models_list_en.md)。
 
+The PP-OCRv3 model uses the GTC strategy, in which the SAR branch carries a large number of parameters. When the training data covers only simple scenes, the model overfits easily and fine-tuning performs poorly, so removing the GTC strategy is recommended. The model-structure section of the configuration file is modified as follows:
+
+```yaml
+Architecture:
+  model_type: rec
+  algorithm: SVTR
+  Transform:
+  Backbone:
+    name: MobileNetV1Enhance
+    scale: 0.5
+    last_conv_stride: [1, 2]
+    last_pool_type: avg
+  Neck:
+    name: SequenceEncoder
+    encoder_type: svtr
+    dims: 64
+    depth: 2
+    hidden_dims: 120
+    use_guide: False
+  Head:
+    name: CTCHead
+    fc_decay: 0.00001
+Loss:
+  name: CTCLoss
+
+Train:
+  dataset:
+  ......
+    transforms:
+    # remove RecConAug
+    # - RecConAug:
+    #     prob: 0.5
+    #     ext_data_num: 2
+    #     image_shape: [48, 320, 3]
+    #     max_text_length: *max_text_length
+    - RecAug:
+    # modify Encode
+    - CTCLabelEncode:
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label
+        - length
+...
+
+Eval:
+  dataset:
+  ...
+    transforms:
+    ...
+    - CTCLabelEncode:
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label
+        - length
+...
+
+
+```
 
 ### 3.3 Training hyperparameter
 
@@ -165,3 +225,5 @@ Train:
 ### 3.4 training optimization
 
 The training process does not happen overnight. After completing a stage of training evaluation, it is recommended to collect and analyze the badcase of the current model in the real scene, adjust the proportion of training data in a targeted manner, or further add synthetic data. Through multiple iterations of training, the model effect is continuously optimized.
+
+If you modify the custom dictionary for training, acc=0 at the start of the iterations is normal because the parameters of the final FC layer cannot be loaded. Don't worry: loading the pre-trained model still speeds up model convergence.

+ 2 - 2
doc/doc_en/inference_args_en.md

@@ -70,7 +70,7 @@ The relevant parameters of the SAST algorithm are as follows
 | :--: | :--: | :--: | :--: |
 |  det_sast_score_thresh | float | 0.5 | Score thresholds in SAST postprocess |
 |  det_sast_nms_thresh | float | 0.5 | Thresholding of nms in SAST postprocess |
-|  det_sast_polygon | bool | False | Whether to use polygon detection; set to True for curved-text scenes (such as Total-Text) |
+|  det_box_type | str | 'quad' | Whether to use polygon detection; set to 'poly' for curved-text scenes (such as Total-Text) |
 
 The relevant parameters of the PSE algorithm are as follows
 
@@ -79,7 +79,7 @@ The relevant parameters of the PSE algorithm are as follows
 |  det_pse_thresh | float | 0.0 | Threshold for binarizing the output image |
 |  det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes, below this threshold is discarded |
 |  det_pse_min_area | float | 16 | The minimum area of the box, below this threshold is discarded |
-|  det_pse_box_type | str | "box" | Type of the returned box; box: four point coordinates, poly: all point coordinates of the curved text |
+|  det_box_type | str | "quad" | Type of the returned box; quad: four point coordinates, poly: all point coordinates of the curved text |
 |  det_pse_scale | int | 1 | The ratio of the input image relative to the post-processed image, such as an image of `640*640`, the network output is `160*160`, and when the scale is 2, the shape of the post-processed image is `320*320`. Increasing this value can speed up the post-processing speed, but it will bring about a decrease in accuracy |
 
 * Text recognition model related parameters

File diff suppressed because it is too large
+ 23 - 25
doc/doc_en/inference_en.md


+ 1 - 0
doc/doc_en/knowledge_distillation_en.md

@@ -228,6 +228,7 @@ Architecture:
               enc_dim: 512
               max_text_length: *max_text_length
 ```
+```
 
 When the model is finally trained, it contains 3 sub-networks: `Teacher`, `Student`, `Student2`.
 

File diff suppressed because it is too large
+ 1 - 1
doc/doc_en/models_list_en.md


+ 5 - 0
doc/doc_en/table_recognition_en.md

@@ -6,6 +6,7 @@ This article provides a full-process guide for the PaddleOCR table recognition m
   - [1.1. DataSet Format](#11-dataset-format)
   - [1.2. Data Download](#12-data-download)
   - [1.3. Dataset Generation](#13-dataset-generation)
+  - [1.4 Data annotation](#14-data-annotation)
 - [2. Training](#2-training)
   - [2.1. Start Training](#21-start-training)
   - [2.2. Resume Training](#22-resume-training)
@@ -80,6 +81,10 @@ Some samples are as follows:
 |Simple Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)|
 |Simple Color Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)|
 
+## 1.4 Data annotation
+
+Data annotation can refer to [PPOCRLabel](../../PPOCRLabel/README.md)
+
 # 2. Training
 
 PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. In this section, the [SLANet](../../configs/table/SLANet.yml) model will be used as an example:

File diff suppressed because it is too large
+ 1 - 1
doc/doc_i18n/README_日本語.md


+ 22 - 1
paddleocr.py

@@ -26,6 +26,9 @@ import cv2
 import logging
 import numpy as np
 from pathlib import Path
+import base64
+from io import BytesIO
+from PIL import Image
 
 tools = importlib.import_module('.', 'tools')
 ppocr = importlib.import_module('.', 'ppocr')
@@ -431,7 +434,25 @@ def check_img(img):
         img, flag_gif, flag_pdf = check_and_read(image_file)
         if not flag_gif and not flag_pdf:
             with open(image_file, 'rb') as f:
-                img = img_decode(f.read())
+                img_str = f.read()
+                img = img_decode(img_str)
+            if img is None:
+                try:
+                    buf = BytesIO()
+                    image = BytesIO(img_str)
+                    im = Image.open(image)
+                    rgb = im.convert('RGB')
+                    rgb.save(buf, 'jpeg')
+                    buf.seek(0)
+                    image_bytes = buf.read()
+                    data_base64 = str(base64.b64encode(image_bytes),
+                                      encoding="utf-8")
+                    image_decode = base64.b64decode(data_base64)
+                    img_array = np.frombuffer(image_decode, np.uint8)
+                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+                except:
+                    logger.error("error in loading image:{}".format(image_file))
+                    return None
         if img is None:
             logger.error("error in loading image:{}".format(image_file))
             return None

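Reviewer note: the new fallback in `check_img` re-decodes with PIL whatever `cv2.imdecode` rejects, then feeds the re-encoded JPEG back to OpenCV. The base64 encode/decode pair in the committed code is an identity round-trip on the JPEG bytes, so an equivalent sketch can skip it (assumption: only the decoded pixels matter downstream):

```python
from io import BytesIO

import cv2
import numpy as np
from PIL import Image


def decode_with_pil_fallback(img_str: bytes):
    """Decode image bytes that cv2.imdecode rejected, via a PIL -> JPEG round-trip."""
    im = Image.open(BytesIO(img_str)).convert('RGB')
    buf = BytesIO()
    im.save(buf, 'jpeg')
    img_array = np.frombuffer(buf.getvalue(), np.uint8)
    return cv2.imdecode(img_array, cv2.IMREAD_COLOR)
```
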
+ 7 - 6
ppocr/data/imaug/abinet_aug.py

@@ -205,7 +205,7 @@ class CVRandomAffine(object):
                      for x, y in startpoints]
 
         rect = cv2.minAreaRect(np.array(endpoints))
-        bbox = cv2.boxPoints(rect).astype(dtype=np.int)
+        bbox = cv2.boxPoints(rect).astype(dtype=np.int32)
         max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max()
         min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min()
 
@@ -234,9 +234,9 @@ class CVRandomPerspective(object):
 
     def get_params(self, width, height, distortion):
         offset_h = sample_asym(
-            distortion * height / 2, size=4).astype(dtype=np.int)
+            distortion * height / 2, size=4).astype(dtype=np.int32)
         offset_w = sample_asym(
-            distortion * width / 2, size=4).astype(dtype=np.int)
+            distortion * width / 2, size=4).astype(dtype=np.int32)
         topleft = (offset_w[0], offset_h[0])
         topright = (width - 1 - offset_w[1], offset_h[1])
         botright = (width - 1 - offset_w[2], height - 1 - offset_h[2])
@@ -256,7 +256,7 @@ class CVRandomPerspective(object):
 
         # TODO: more robust way to crop image
         rect = cv2.minAreaRect(endpoints)
-        bbox = cv2.boxPoints(rect).astype(dtype=np.int)
+        bbox = cv2.boxPoints(rect).astype(dtype=np.int32)
         max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max()
         min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min()
         min_x, min_y = max(min_x, 0), max(min_y, 0)
@@ -441,7 +441,8 @@ class SVTRGeometry(object):
         self.p = p
         self.transforms = []
         self.transforms.append(CVRandomRotation(degrees=degrees))
-        self.transforms.append(CVRandomAffine(
+        self.transforms.append(
+            CVRandomAffine(
                 degrees=degrees, translate=translate, scale=scale, shear=shear))
         self.transforms.append(CVRandomPerspective(distortion=distortion))
 
@@ -455,4 +456,4 @@ class SVTRGeometry(object):
                 img = self.transforms[random.randint(0, 2)](img)
             return img
         else:
-            return img
+            return img

+ 4 - 1
ppocr/data/imaug/ct_process.py

@@ -19,7 +19,8 @@ import pyclipper
 import paddle
 
 import numpy as np
-import Polygon as plg
+from ppocr.utils.utility import check_install
+
 import scipy.io as scio
 
 from PIL import Image
@@ -70,6 +71,8 @@ class MakeShrink():
         return peri
 
     def shrink(self, bboxes, rate, max_shr=20):
+        check_install('Polygon', 'Polygon3')
+        import Polygon as plg
         rate = rate * rate
         shrinked_bboxes = []
         for bbox in bboxes:

+ 3 - 1
ppocr/data/imaug/drrg_targets.py

@@ -18,7 +18,7 @@ https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_t
 
 import cv2
 import numpy as np
-from lanms import merge_quadrangle_n9 as la_nms
+from ppocr.utils.utility import check_install
 from numpy.linalg import norm
 
 
@@ -543,6 +543,8 @@ class DRRGTargets(object):
 
         score = np.ones((text_comps.shape[0], 1), dtype=np.float32)
         text_comps = np.hstack([text_comps, score])
+        check_install('lanms', 'lanms-neo')
+        from lanms import merge_quadrangle_n9 as la_nms
         text_comps = la_nms(text_comps, self.text_comp_nms_thr)
 
         if text_comps.shape[0] >= 1:

+ 1 - 1
ppocr/data/imaug/fce_aug.py

@@ -208,7 +208,7 @@ class RandomCropFlip:
         for polygon in all_polys:
             rect = cv2.minAreaRect(polygon.astype(np.int32).reshape(-1, 2))
             box = cv2.boxPoints(rect)
-            box = np.int0(box)
+            box = np.int64(box)
             text_polys.append([box[0], box[1], box[2], box[3]])
 
         polys = np.array(text_polys, dtype=np.int32)

+ 10 - 7
ppocr/data/imaug/fce_targets.py

@@ -22,10 +22,12 @@ from numpy.fft import fft
 from numpy.linalg import norm
 import sys
 
+
 def vector_slope(vec):
     assert len(vec) == 2
     return abs(vec[1] / (vec[0] + 1e-8))
 
+
 class FCENetTargets:
     """Generate the ground truth targets of FCENet: Fourier Contour Embedding
     for Arbitrary-Shaped Text Detection.
@@ -107,7 +109,9 @@ class FCENetTargets:
         for i in range(1, n):
             current_line_len = i * delta_length
 
-            while current_edge_ind + 1 < len(length_cumsum) and current_line_len >= length_cumsum[current_edge_ind + 1]:
+            while current_edge_ind + 1 < len(
+                    length_cumsum) and current_line_len >= length_cumsum[
+                        current_edge_ind + 1]:
                 current_edge_ind += 1
 
             current_edge_end_shift = current_line_len - length_cumsum[
@@ -239,10 +243,9 @@ class FCENetTargets:
             head_inds = [head_start, head_end]
             tail_inds = [tail_start, tail_end]
         else:
-            if vector_slope(points[1] - points[0]) + vector_slope(
-                    points[3] - points[2]) < vector_slope(points[
-                        2] - points[1]) + vector_slope(points[0] - points[
-                            3]):
+            if vector_slope(points[1] - points[0]) + vector_slope(points[
+                    3] - points[2]) < vector_slope(points[2] - points[
+                        1]) + vector_slope(points[0] - points[3]):
                 horizontal_edge_inds = [[0, 1], [2, 3]]
                 vertical_edge_inds = [[3, 0], [1, 2]]
             else:
@@ -582,7 +585,7 @@ class FCENetTargets:
         lv_ignore_polys = [[] for i in range(len(lv_size_divs))]
         level_maps = []
         for poly in text_polys:
-            polygon = np.array(poly, dtype=np.int).reshape((1, -1, 2))
+            polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2))
             _, _, box_w, box_h = cv2.boundingRect(polygon)
             proportion = max(box_h, box_w) / (h + 1e-8)
 
@@ -591,7 +594,7 @@ class FCENetTargets:
                     lv_text_polys[ind].append(poly / lv_size_divs[ind])
 
         for ignore_poly in ignore_polys:
-            polygon = np.array(ignore_poly, dtype=np.int).reshape((1, -1, 2))
+            polygon = np.array(ignore_poly, dtype=np.int32).reshape((1, -1, 2))
             _, _, box_w, box_h = cv2.boundingRect(polygon)
             proportion = max(box_h, box_w) / (h + 1e-8)
 

+ 3 - 3
ppocr/data/imaug/label_ops.py

@@ -64,7 +64,7 @@ class DetLabelEncode(object):
             return None
         boxes = self.expand_points_num(boxes)
         boxes = np.array(boxes, dtype=np.float32)
-        txt_tags = np.array(txt_tags, dtype=np.bool)
+        txt_tags = np.array(txt_tags, dtype=np.bool_)
 
         data['polys'] = boxes
         data['texts'] = txts
@@ -218,7 +218,7 @@ class E2ELabelEncodeTest(BaseRecLabelEncode):
             else:
                 txt_tags.append(False)
         boxes = np.array(boxes, dtype=np.float32)
-        txt_tags = np.array(txt_tags, dtype=np.bool)
+        txt_tags = np.array(txt_tags, dtype=np.bool_)
         data['polys'] = boxes
         data['ignore_tags'] = txt_tags
         temp_texts = []
@@ -254,7 +254,7 @@ class E2ELabelEncodeTrain(object):
             else:
                 txt_tags.append(False)
         boxes = np.array(boxes, dtype=np.float32)
-        txt_tags = np.array(txt_tags, dtype=np.bool)
+        txt_tags = np.array(txt_tags, dtype=np.bool_)
 
         data['polys'] = boxes
         data['texts'] = txts

+ 3 - 11
ppocr/modeling/heads/det_db_head.py

@@ -31,7 +31,7 @@ def get_bias_attr(k):
 
 
 class Head(nn.Layer):
-    def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs):
+    def __init__(self, in_channels, kernel_list=[3, 2, 2], **kwargs):
         super(Head, self).__init__()
 
         self.conv1 = nn.Conv2D(
@@ -93,16 +93,8 @@ class DBHead(nn.Layer):
     def __init__(self, in_channels, k=50, **kwargs):
         super(DBHead, self).__init__()
         self.k = k
-        binarize_name_list = [
-            'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
-            'conv2d_transpose_1', 'binarize'
-        ]
-        thresh_name_list = [
-            'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
-            'conv2d_transpose_3', 'thresh'
-        ]
-        self.binarize = Head(in_channels, binarize_name_list, **kwargs)
-        self.thresh = Head(in_channels, thresh_name_list, **kwargs)
+        self.binarize = Head(in_channels, **kwargs)
+        self.thresh = Head(in_channels, **kwargs)
 
     def step_function(self, x, y):
         return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))

+ 1 - 1
ppocr/modeling/heads/proposal_local_graph.py

@@ -40,7 +40,7 @@ def fill_hole(input_mask):
     mask = np.zeros((h + 4, w + 4), np.uint8)
 
     cv2.floodFill(canvas, mask, (0, 0), 1)
-    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
+    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool_)
 
     return ~canvas | input_mask
 

+ 5 - 0
ppocr/modeling/heads/rec_nrtr_head.py

@@ -17,6 +17,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F
 from paddle.nn import LayerList
+# from paddle.nn.initializer import XavierNormal as xavier_uniform_
 from paddle.nn import Dropout, Linear, LayerNorm
 import numpy as np
 from ppocr.modeling.backbones.rec_svtrnet import Mlp, zeros_, ones_
@@ -29,6 +30,7 @@ class Transformer(nn.Layer):
     Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
     Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
     Processing Systems, pages 6000-6010.
+
     Args:
         d_model: the number of expected features in the encoder/decoder inputs (default=512).
         nhead: the number of heads in the multiheadattention models (default=8).
@@ -346,12 +348,15 @@ class MultiheadAttention(nn.Layer):
     """Allows the model to jointly attend to information
     from different representation subspaces.
     See reference: Attention Is All You Need
+
     .. math::
         \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
         \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+
     Args:
         embed_dim: total dimension of the model
         num_heads: parallel attention layers, or heads
+
     """
 
     def __init__(self, embed_dim, num_heads, dropout=0., self_attn=False):

+ 3 - 4
ppocr/postprocess/drrg_postprocess.py

@@ -68,7 +68,7 @@ def graph_propagation(edges, scores, text_comps, edge_len_thr=50.):
             score_dict[edge[0], edge[1]] = scores[i]
 
     nodes = np.sort(np.unique(edges.flatten()))
-    mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int)
+    mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int32)
     mapping[nodes] = np.arange(nodes.shape[0])
     order_inds = mapping[edges]
     vertices = [Node(node) for node in nodes]
@@ -93,9 +93,8 @@ def connected_components(nodes, score_dict, link_thr):
         while node_queue:
             node = node_queue.pop(0)
             neighbors = set([
-                neighbor for neighbor in node.links
-                if score_dict[tuple(sorted([node.ind, neighbor.ind]))] >=
-                link_thr
+                neighbor for neighbor in node.links if
+                score_dict[tuple(sorted([node.ind, neighbor.ind]))] >= link_thr
             ])
             neighbors.difference_update(cluster)
             nodes.difference_update(neighbors)

+ 3 - 2
ppocr/postprocess/east_postprocess.py

@@ -22,6 +22,7 @@ import cv2
 import paddle
 
 import os
+from ppocr.utils.utility import check_install
 import sys
 
 
@@ -78,11 +79,11 @@ class EASTPostProcess(object):
         boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
 
         try:
+            check_install('lanms', 'lanms-nova')
             import lanms
-            boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
         except:
             print(
-                'you should install lanms by pip3 install lanms-nova to speed up nms_locality'
+                'You should install lanms by pip3 install lanms-nova to speed up nms_locality'
             )
             boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
         if boxes.shape[0] == 0:

+ 2 - 2
ppocr/postprocess/fce_postprocess.py

@@ -31,7 +31,7 @@ def fill_hole(input_mask):
     mask = np.zeros((h + 4, w + 4), np.uint8)
 
     cv2.floodFill(canvas, mask, (0, 0), 1)
-    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
+    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool_)
 
     return ~canvas | input_mask
 
@@ -234,7 +234,7 @@ class FCEPostProcess(object):
                 poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32)
                 score = boundary[-1]
                 points = cv2.boxPoints(cv2.minAreaRect(poly))
-                points = np.int0(points)
+                points = np.int64(points)
                 new_boundaries.append(points.reshape(-1).tolist() + [score])
                 boundaries = new_boundaries
 

+ 1 - 1
ppocr/postprocess/rec_postprocess.py

@@ -891,7 +891,7 @@ class VLLabelDecode(BaseRecLabelDecode):
             ) + length[i])].topk(1)[0][:, 0]
             preds_prob = paddle.exp(
                 paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6))
-            text.append((preds_text, preds_prob.numpy()[0]))
+            text.append((preds_text, float(preds_prob)))
         if label is None:
             return text
         label = self.decode(label)

+ 2 - 0
ppocr/postprocess/sast_postprocess.py

@@ -141,6 +141,8 @@ class SASTPostProcess(object):
 
     def nms(self, dets):
         if self.is_python35:
+            from ppocr.utils.utility import check_install
+            check_install('lanms', 'lanms-nova')
             import lanms
             dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh)
         else:

+ 5 - 1
ppocr/utils/e2e_metric/Deteval.py

@@ -15,7 +15,9 @@
 import json
 import numpy as np
 import scipy.io as io
-import Polygon as plg
+
+from ppocr.utils.utility import check_install
+
 from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area
 
 
@@ -275,6 +277,8 @@ def get_score_C(gt_label, text, pred_bboxes):
     """
     get score for CentripetalText (CT) prediction.
     """
+    check_install("Polygon", "Polygon3")
+    import Polygon as plg
 
     def gt_reading_mod(gt_label, text):
         """This helper reads groundtruths from mat files"""

+ 1 - 1
ppocr/utils/gen_label.py

@@ -29,7 +29,7 @@ def gen_rec_label(input_path, out_label):
 def gen_det_label(root_path, input_dir, out_label):
     with open(out_label, 'w') as out_file:
         for label_file in os.listdir(input_dir):
-            img_path = root_path + label_file[3:-4] + ".jpg"
+            img_path = os.path.join(root_path, label_file[3:-4] + ".jpg")
             label = []
             with open(
                     os.path.join(input_dir, label_file), 'r',

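Reviewer note: the `gen_det_label` change swaps string concatenation for `os.path.join`, which inserts the separator that plain `+` silently drops when `root_path` has no trailing slash:

```python
import os

root = "train_data/icdar2015"    # hypothetical root without a trailing slash
name = "img_1.jpg"
print(root + name)               # train_data/icdar2015img_1.jpg  (broken path)
print(os.path.join(root, name))  # train_data/icdar2015/img_1.jpg
```
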
+ 23 - 0
ppocr/utils/utility.py

@@ -19,6 +19,9 @@ import cv2
 import random
 import numpy as np
 import paddle
+import importlib.util
+import sys
+import subprocess
 
 
 def print_dict(d, logger, delimiter=0):
@@ -131,6 +134,26 @@ def set_seed(seed=1024):
     paddle.seed(seed)
 
 
+def check_install(module_name, install_name):
+    spec = importlib.util.find_spec(module_name)
+    if spec is None:
+        print(f'Warning! The {module_name} module is NOT installed')
+        print(
+            f'Trying to install the {module_name} module automatically. You can also install it manually with pip install {install_name}.'
+        )
+        python = sys.executable
+        try:
+            subprocess.check_call(
+                [python, '-m', 'pip', 'install', install_name],
+                stdout=subprocess.DEVNULL)
+            print(f'The {module_name} module is now installed')
+        except subprocess.CalledProcessError as exc:
+            raise Exception(
+                f"Failed to install {module_name}, please install it manually"
+            ) from exc
+    else:
+        print(f"{module_name} has been installed.")
+
+
 class AverageMeter:
     def __init__(self):
         self.reset()

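A usage sketch of the new helper: the first argument is the import name, the second the pip distribution name, which often differ (the hunks above pair `lanms` with `lanms-nova` and `Polygon` with `Polygon3`):

```python
from ppocr.utils.utility import check_install

check_install('lanms', 'lanms-nova')  # installs lanms-nova if `import lanms` would fail
import lanms

check_install('Polygon', 'Polygon3')
import Polygon as plg
```
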
+ 13 - 0
ppstructure/docs/quickstart.md

@@ -104,6 +104,19 @@ paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --rec
 
 Via OCR:
 
+Layout recovery supports two methods; for details see the [layout recovery tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/recovery/README_ch.md):
+
+- PDF parsing
+- OCR
+
+Via PDF parsing (only PDF input is supported):
+
+```bash
+paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true
+```
+
+Via OCR:
+
 ```bash
 # Chinese test image
 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true

+ 7 - 0
ppstructure/kie/README.md

@@ -186,6 +186,10 @@ python3 ./tools/infer_kie_token_ser_re.py \
 
 The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory.
 
+If you want to use a custom OCR model, you can set it through the following fields:
+- `Global.kie_det_model_dir`: the detection inference model path
+- `Global.kie_rec_model_dir`: the recognition inference model path
+
 
 If you want to load the text detection and recognition results collected before, you can use the following command to predict.
 
@@ -257,6 +261,9 @@ python3 kie/predict_kie_token_ser_re.py \
 
 The visual results and text file will be saved in directory `output`.
 
+If you want to use a custom OCR model, you can set it through the following fields:
+- `--det_model_dir`: the detection inference model path
+- `--rec_model_dir`: the recognition inference model path
 
 ### 4.3 More
 

+ 7 - 0
ppstructure/kie/README_ch.md

@@ -170,6 +170,10 @@ python3 ./tools/infer_kie_token_ser_re.py \
 
 The visualized result images and the predicted text file are saved in the `Global.save_res_path` directory.
 
+If you want to use a custom OCR model, you can set it through the following fields:
+- `Global.kie_det_model_dir`: path of the detection inference model
+- `Global.kie_rec_model_dir`: path of the recognition inference model
+
 
 If you want to load previously annotated text detection and recognition results, you can run prediction only with the following command.
 
@@ -239,6 +243,9 @@ python3 kie/predict_kie_token_ser_re.py \
 
 The visualized results are saved in the `output` directory.
 
+If you want to use a custom OCR model, you can set it through the following fields:
+- `--det_model_dir`: path of the detection inference model
+- `--rec_model_dir`: path of the recognition inference model
 
 ### 4.3 More
 

+ 1 - 1
ppstructure/kie/requirements.txt

@@ -2,6 +2,6 @@ sentencepiece
 yacs
 seqeval
 pypandoc
-attrdict
+attrdict3
 python_docx
 paddlenlp>=2.4.1

+ 1 - 1
ppstructure/table/predict_table.py

@@ -93,7 +93,7 @@ class TableSystem(object):
         time_dict['rec'] = rec_elapse
 
         if return_ocr_result_in_table:
-            result['boxes'] = dt_boxes  #[x.tolist() for x in dt_boxes]
+            result['boxes'] = [x.tolist() for x in dt_boxes]
             result['rec_res'] = rec_res
 
         tic = time.time()

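Materializing the boxes with `tolist()` matters once the result dict leaves Python: `json.dumps` rejects `numpy.ndarray` values. A minimal sketch:

```python
import json
import numpy as np

dt_boxes = [np.array([[10, 10], [90, 10], [90, 40], [10, 40]])]
result = {'boxes': [x.tolist() for x in dt_boxes]}
print(json.dumps(result))  # fine; passing the raw arrays would raise TypeError
```
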
+ 3 - 5
requirements.txt

@@ -7,13 +7,11 @@ tqdm
 numpy
 visualdl
 rapidfuzz
-opencv-python==4.6.0.66
-opencv-contrib-python==4.6.0.66
+opencv-python<=4.6.0.66
+opencv-contrib-python<=4.6.0.66
 cython
 lxml
 premailer
 openpyxl
 attrdict
-Polygon3
-lanms-neo==1.0.2
-PyMuPDF<1.21.0
+PyMuPDF<1.21.0

+ 4 - 2
test_tipc/prepare.sh

@@ -150,7 +150,9 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
     # pretrain lite train data
     wget -nc -P  ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams  --no-check-certificate
     wget -nc -P ./pretrain_models/  https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar  --no-check-certificate
-    cd ./pretrain_models/ && tar xf det_mv3_db_v2.0_train.tar && cd ../
+    cd ./pretrain_models/
+    tar xf det_mv3_db_v2.0_train.tar
+    cd ../
     if [[ ${model_name} =~ "ch_PP-OCRv2_det" ]];then
         wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar --no-check-certificate
         cd ./pretrain_models/ && tar xf ch_PP-OCRv2_det_distill_train.tar && cd ../
@@ -366,7 +368,7 @@ elif [ ${MODE} = "whole_infer" ];then
     python_name_list=$(func_parser_value "${lines[2]}")
     array=(${python_name_list}) 
     python_name=${array[0]}
-    ${python_name} -m pip install paddleslim --force-reinstall
+    ${python_name} -m pip install paddleslim
     ${python_name} -m pip install -r requirements.txt
     wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
     wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate

+ 13 - 13
test_tipc/supplementary/train.py

@@ -168,22 +168,22 @@ def train(config, scaler=None):
             if idx % 10 == 0:
                 et = time.time()
                 strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], "
-                strs += f"loss: {avg_loss.numpy()[0]}"
-                strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}"
+                strs += f"loss: {float(avg_loss)}"
+                strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}"
                 strs += f", batch_time: {round(et-st, 4)} s"
                 logger.info(strs)
                 st = time.time()
 
         if epoch % 10 == 0:
             acc = eval(config, model)
-            if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']:
+            if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']:
                 best_acc = acc
                 best_acc['epoch'] = epoch
                 is_best = True
             else:
                 is_best = False
             logger.info(
-                f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}"
+                f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}"
             )
             save_model(
                 model,
@@ -276,22 +276,22 @@ def train_distill(config, scaler=None):
             if idx % 10 == 0:
                 et = time.time()
                 strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], "
-                strs += f"loss: {avg_loss.numpy()[0]}"
-                strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}"
+                strs += f"loss: {float(avg_loss)}"
+                strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}"
                 strs += f", batch_time: {round(et-st, 4)} s"
                 logger.info(strs)
                 st = time.time()
 
         if epoch % 10 == 0:
             acc = eval(config, model._layers.student)
-            if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']:
+            if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']:
                 best_acc = acc
                 best_acc['epoch'] = epoch
                 is_best = True
             else:
                 is_best = False
             logger.info(
-                f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}"
+                f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}"
             )
 
             save_model(
@@ -401,22 +401,22 @@ def train_distill_multiopt(config, scaler=None):
             if idx % 10 == 0:
                 et = time.time()
                 strs = f"epoch: [{epoch}/{EPOCH}], iter: [{idx}/{data_num}], "
-                strs += f"loss: {avg_loss.numpy()[0]}, loss1: {avg_loss1.numpy()[0]}"
-                strs += f", acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}"
+                strs += f"loss: {float(avg_loss)}, loss1: {float(avg_loss1)}"
+                strs += f", acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}"
                 strs += f", batch_time: {round(et-st, 4)} s"
                 logger.info(strs)
                 st = time.time()
 
         if epoch % 10 == 0:
             acc = eval(config, model._layers.student)
-            if len(best_acc) < 1 or acc['top5'].numpy()[0] > best_acc['top5']:
+            if len(best_acc) < 1 or float(acc['top5']) > best_acc['top5']:
                 best_acc = acc
                 best_acc['epoch'] = epoch
                 is_best = True
             else:
                 is_best = False
             logger.info(
-                f"The best acc: acc_topk1: {best_acc['top1'].numpy()[0]}, acc_top5: {best_acc['top5'].numpy()[0]}, best_epoch: {best_acc['epoch']}"
+                f"The best acc: acc_topk1: {float(best_acc['top1'])}, acc_top5: {float(best_acc['top5'])}, best_epoch: {best_acc['epoch']}"
             )
             save_model(
                 model, [optimizer, optimizer1],
@@ -450,7 +450,7 @@ def eval(config, model):
     labels = paddle.concat(labels, axis=0)
     acc = metric_func(outs, labels)
 
-    strs = f"The metric are as follows: acc_topk1: {acc['top1'].numpy()[0]}, acc_top5: {acc['top5'].numpy()[0]}"
+    strs = f"The metric are as follows: acc_topk1: {float(acc['top1'])}, acc_top5: {float(acc['top5'])}"
     logger.info(strs)
     return acc
 

+ 5 - 2
test_tipc/test_serving_infer_cpp.sh

@@ -103,7 +103,9 @@ function func_serving(){
             last_status=${PIPESTATUS[0]}
             eval "cat ${_save_log_path}"
             status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}"
-            ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9
+            #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9
+            ${python_list[0]} ${web_service_py} stop
+            sleep 5s
         else
             server_log_path="${LOG_PATH}/cpp_server_gpu.log"
             web_service_cpp_cmd="nohup ${python_list[0]} ${web_service_py} --model ${det_server_value} ${rec_server_value} ${op_key} ${op_value} ${port_key} ${port_value} ${gpu_key} ${gpu_id} > ${server_log_path} 2>&1 &"
@@ -115,7 +117,8 @@ function func_serving(){
             last_status=${PIPESTATUS[0]}
             eval "cat ${_save_log_path}" 
             status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}"
-            ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9
+            #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9
+            ${python_list[0]} ${web_service_py} stop
         fi
     done
 }

+ 3 - 1
tools/infer/predict_det.py

@@ -143,7 +143,9 @@ class TextDetector(object):
 
         if self.use_onnx:
             img_h, img_w = self.input_tensor.shape[2:]
-            if img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
+            if isinstance(img_h, str) or isinstance(img_w, str):
+                pass
+            elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
                 pre_process_list[0] = {
                     'DetResizeForTest': {
                         'image_shape': [img_h, img_w]

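Exported ONNX models report dynamic axes as symbolic names (strings such as `'?'` or `'height'`) rather than integers, so the fixed-shape branch must skip them. A sketch of the guard, assuming an `onnxruntime` session, an NCHW detection input, and a hypothetical model path:

```python
import onnxruntime as ort

sess = ort.InferenceSession("det_model.onnx")  # hypothetical path
img_h, img_w = sess.get_inputs()[0].shape[2:]  # ints, or str for dynamic axes
if isinstance(img_h, str) or isinstance(img_w, str):
    pass  # dynamic spatial dims: keep the default resize policy
elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
    print(f"fixed input shape: {img_h}x{img_w}")
```
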
+ 3 - 2
tools/infer/predict_rec.py

@@ -173,9 +173,10 @@ class TextRecognizer(object):
         imgW = int((imgH * max_wh_ratio))
         if self.use_onnx:
             w = self.input_tensor.shape[3:][0]
-            if w is not None and w > 0:
+            if isinstance(w, str):
+                pass
+            elif w is not None and w > 0:
                 imgW = w
-
         h, w = img.shape[:2]
         ratio = w / float(h)
         if math.ceil(imgH * ratio) > imgW:

+ 3 - 2
tools/train.py

@@ -152,9 +152,10 @@ def main(config, device, logger, vdl_writer):
         AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
         if paddle.is_compiled_with_cuda():
             AMP_RELATED_FLAGS_SETTING.update({
-                'FLAGS_cudnn_batchnorm_spatial_persistent': 1
+                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
+                'FLAGS_gemm_use_half_precision_compute_type': 0,
             })
-        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
+        paddle.set_flags(AMP_RELATED_FLAGS_SETTING)
         scale_loss = config["Global"].get("scale_loss", 1.0)
         use_dynamic_loss_scaling = config["Global"].get(
             "use_dynamic_loss_scaling", False)