# Mask R-CNN configuration: VisionTransformer backbone + HRFPN neck
# (see the `architecture` and `MaskRCNN` sections below).
#
# NOTE(review): this file was restored from a whitespace-collapsed copy; the
# nesting below was reconstructed from the key names. Confirm it against the
# consumer's expected schema before relying on it.

# Base configs this file builds on; paths are as written in the source.
_BASE_: [
  '../datasets/coco_instance.yml',
  '../runtime.yml',
  './_base_/mask_rcnn_reader.yml',
  './_base_/optimizer_base_1x.yml'
]

weights: output/mask_rcnn_vit_base_hrfpn_cae_1x_coco/model_final

# runtime
log_iter: 100
snapshot_epoch: 1
norm_type: sync_bn
# The anchor defined on this value is aliased by
# `VisionTransformer.use_checkpoint`, so the two flags always share one value.
use_fused_allreduce_gradients: &use_checkpoint False

architecture: MaskRCNN
MaskRCNN:
  backbone: VisionTransformer
  neck: HRFPN
  rpn_head: RPNHead
  bbox_head: BBoxHead
  mask_head: MaskHead
  # post process
  bbox_post_process: BBoxPostProcess
  mask_post_process: MaskPostProcess

VisionTransformer:
  patch_size: 16
  embed_dim: 768
  depth: 12
  num_heads: 12
  mlp_ratio: 4
  qkv_bias: True
  drop_rate: 0.0
  drop_path_rate: 0.2
  init_values: 0.1
  final_norm: False
  use_rel_pos_bias: False
  use_sincos_pos_emb: True
  epsilon: 0.000001  # 1e-6
  out_indices: [3, 5, 7, 11]
  with_fpn: True
  # Alias of the `&use_checkpoint` anchor declared in the runtime section.
  use_checkpoint: *use_checkpoint
  pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams

HRFPN:
  out_channel: 256
  use_bias: True

RPNHead:
  anchor_generator:
    aspect_ratios: [0.5, 1.0, 2.0]
    anchor_sizes: [[32], [64], [128], [256], [512]]
    strides: [4, 8, 16, 32, 64]
  rpn_target_assign:
    batch_size_per_im: 256
    fg_fraction: 0.5
    negative_overlap: 0.3
    positive_overlap: 0.7
    use_random: True
  train_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 2000
    post_nms_top_n: 1000
    topk_after_collect: True
  test_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 1000
    post_nms_top_n: 1000
  # Names the `SmoothL1Loss` section configured below.
  loss_rpn_bbox: SmoothL1Loss

SmoothL1Loss:
  beta: 0.1111111111111111

BBoxHead:
  head: XConvNormHead
  roi_extractor:
    resolution: 7
    sampling_ratio: 0
    aligned: True
  bbox_assigner: BBoxAssigner
  loss_normalize_pos: True
  # Names the `GIoULoss` section configured below.
  bbox_loss: GIoULoss

BBoxAssigner:
  batch_size_per_im: 512
  bg_thresh: 0.5
  fg_thresh: 0.5
  fg_fraction: 0.25
  use_random: True

XConvNormHead:
  num_convs: 4
  norm_type: bn

GIoULoss:
  loss_weight: 10.0
  reduction: 'none'
  eps: 0.000001

BBoxPostProcess:
  decode: RCNNBox
  nms:
    name: MultiClassNMS
    keep_top_k: 100
    score_threshold: 0.05
    nms_threshold: 0.5

MaskHead:
  head: MaskFeat
  roi_extractor:
    resolution: 14
    sampling_ratio: 0
    aligned: True
  mask_assigner: MaskAssigner
  share_bbox_feat: False

MaskFeat:
  num_convs: 4
  out_channel: 256
  # Explicit null (the source wrote `~`, which loads to the same value).
  norm_type: null

MaskAssigner:
  mask_resolution: 28

MaskPostProcess:
  binary_thresh: 0.5