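# PaddleOCR-style KIE configuration: fine-tunes LayoutXLM on the Chinese
# subset of XFUND for the SER (semantic entity recognition) task.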
Global:
  use_gpu: True
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/ser_layoutxlm_xfund_zh
  save_epoch_step: 2000
  # evaluation is run every 187 iterations after the 0th iteration
  eval_batch_step: [ 0, 187 ]
  cal_metric_during_train: False
  save_inference_dir:
  use_visualdl: False
  seed: 2022
  infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
  save_res_path: ./output/ser_layoutxlm_xfund_zh/res
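
# LayoutXLM backbone with a token-classification (SER) head; num_classes = 7
# corresponds to the B-/I- tags of the XFUND QUESTION, ANSWER and HEADER
# categories plus OTHER (see class_list_xfun.txt referenced below).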
Architecture:
  model_type: kie
  algorithm: &algorithm "LayoutXLM"
  Transform:
  Backbone:
    name: LayoutXLMForSer
    pretrained: True
    checkpoints:
    num_classes: &num_classes 7
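
# Token-classification loss over the SER classes; key presumably selects the
# "backbone_out" entry of the model output that the loss reads.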
Loss:
  name: VQASerTokenLayoutLMLoss
  num_classes: *num_classes
  key: "backbone_out"
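
# AdamW with a linear learning-rate schedule: 2 warmup epochs, then decay
# over the remaining *epoch_num epochs. L2 regularization is present but
# effectively off (factor 0).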
Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Linear
    learning_rate: 0.00005
    epochs: *epoch_num
    warmup_epoch: 2
  regularizer:
    name: L2
    factor: 0.00000
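
# Maps predicted token class ids back to label names via the class list,
# which is shared with VQATokenLabelEncode through the &class_path anchor.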
PostProcess:
  name: VQASerTokenLayoutLMPostProcess
  class_path: &class_path train_data/XFUND/class_list_xfun.txt
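
# Reports precision, recall and their harmonic mean; hmean is the indicator
# used to pick the best checkpoint.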
Metric:
  name: VQASerTokenMetric
  main_indicator: hmean
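
# Pipeline: decode the image, tokenize the OCR result and encode SER labels,
# pad/chunk token sequences to max_seq_len = 512, resize the image to 224x224
# and normalize with ImageNet statistics (mean/std in the 0-255 range, hence
# scale: 1), convert to CHW, and emit the tensors listed under keep_keys.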
Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_train/image
    label_file_list:
      - train_data/XFUND/zh_train/train.json
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels' ] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4
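
# Evaluation mirrors the training pipeline on the validation split, with
# shuffling disabled.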
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_val/image
    label_file_list:
      - train_data/XFUND/zh_val/val.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels' ] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4