# Base config files this config builds on, listed one per line.
# NOTE(review): presumably keys set later in this file override the values
# inherited from these files — confirm against the config loader.
_BASE_:
  - '../datasets/coco_detection.yml'
  - '../runtime.yml'
  - './_base_/faster_rcnn_reader.yml'
  - './_base_/optimizer_base_1x.yml'

# NOTE(review): presumably the checkpoint path prefix consumed by the
# eval/infer tooling — confirm against whatever reads `weights`.
weights: output/faster_rcnn_vit_base_fpn_cae_1x_coco/model_final


# runtime
# Logging / checkpoint cadence.
# NOTE(review): units are presumably iterations for `log_iter` and epochs
# for `snapshot_epoch` — confirm.
log_iter: 100
snapshot_epoch: 1
find_unused_parameters: true

use_gpu: true
# Top-level normalization type; the XConvNormHead section further down sets
# its own `norm_type: bn`.
norm_type: sync_bn

# Only the optimizer's weight decay is set here; the remaining optimizer
# settings are not defined in this file (presumably inherited from
# ./_base_/optimizer_base_1x.yml — confirm).
OptimizerBuilder:
  optimizer:
    weight_decay: 0.05

# reader
# Only `worker_num` and the training batch size are set here; other reader
# settings are not defined in this file (presumably inherited from
# ./_base_/faster_rcnn_reader.yml — confirm).
worker_num: 2
TrainReader:
  # NOTE(review): presumably a per-device batch size — confirm.
  batch_size: 1


# model
# Selects the top-level architecture; its components are wired up in the
# section of the same name directly below.
architecture: FasterRCNN

# Names the component used for each FasterRCNN slot. Every name listed here
# has a matching top-level section in this file that holds its settings.
FasterRCNN:
  backbone: VisionTransformer
  neck: FPN
  rpn_head: RPNHead
  bbox_head: BBoxHead
  bbox_post_process: BBoxPostProcess

# Settings for the backbone named by `FasterRCNN.backbone`.
VisionTransformer:
  patch_size: 16
  embed_dim: 768
  depth: 12
  num_heads: 12
  mlp_ratio: 4
  qkv_bias: true
  drop_rate: 0.0
  drop_path_rate: 0.2
  init_values: 0.1
  final_norm: false
  use_rel_pos_bias: false
  use_sincos_pos_emb: true
  # 1e-6, written out in decimal: a bare `1e-6` (no dot) is loaded as a
  # string by YAML 1.1 parsers such as PyYAML.
  epsilon: 0.000001
  # Four entries. NOTE(review): with `depth: 12`, index 11 would be the last
  # block if indices are 0-based — confirm the indexing convention.
  out_indices: [3, 5, 7, 11]
  with_fpn: true
  pretrained: 'https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams'


# Settings for the neck named by `FasterRCNN.neck`.
FPN:
  out_channel: 256

# Settings for the head named by `FasterRCNN.rpn_head`.
RPNHead:
  # Five anchor sizes and five strides are listed, i.e. one size per stride
  # entry; each is combined with the three aspect ratios.
  anchor_generator:
    aspect_ratios: [0.5, 1.0, 2.0]
    anchor_sizes: [[32], [64], [128], [256], [512]]
    strides: [4, 8, 16, 32, 64]
  rpn_target_assign:
    batch_size_per_im: 256
    fg_fraction: 0.5
    negative_overlap: 0.3
    positive_overlap: 0.7
    use_random: true
  # Train and test proposal settings differ only in `pre_nms_top_n`
  # (2000 vs 1000) and in `topk_after_collect`, which is set for train only.
  train_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 2000
    post_nms_top_n: 1000
    topk_after_collect: true
  test_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 1000
    post_nms_top_n: 1000
  # Configured in the top-level SmoothL1Loss section of this file.
  loss_rpn_bbox: SmoothL1Loss


# Settings for the loss named by `RPNHead.loss_rpn_bbox`.
SmoothL1Loss:
  beta: 0.1111111111111111  # ~= 1/9


# Settings for the head named by `FasterRCNN.bbox_head`.
BBoxHead:
  # XConvNormHead is the active head; the TwoFCHead alternative is kept
  # commented out.
  # head: TwoFCHead
  head: XConvNormHead
  roi_extractor:
    resolution: 7
    sampling_ratio: 0
    aligned: true
  # Configured in the top-level BBoxAssigner section of this file.
  bbox_assigner: BBoxAssigner
  loss_normalize_pos: true
  # Configured in the top-level GIoULoss section of this file.
  bbox_loss: GIoULoss


# Settings for the loss named by `BBoxHead.bbox_loss`.
GIoULoss:
  loss_weight: 10.0
  reduction: 'none'
  eps: 0.000001  # 1e-6


# Settings for the assigner named by `BBoxHead.bbox_assigner`.
BBoxAssigner:
  # Sampling settings.
  batch_size_per_im: 512
  fg_fraction: 0.25
  use_random: true
  # Both thresholds are set to the same value.
  fg_thresh: 0.5
  bg_thresh: 0.5

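# Disabled alternative to XConvNormHead (goes with the commented-out
# `head: TwoFCHead` line in BBoxHead):
# TwoFCHead:
#   out_channel: 1024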

# Settings for the head selected by `BBoxHead.head`.
XConvNormHead:
  num_convs: 4
  # Note: differs from the top-level `norm_type: sync_bn` set earlier in
  # this file.
  norm_type: bn


# Settings for the post-processing step named by
# `FasterRCNN.bbox_post_process`.
BBoxPostProcess:
  decode: RCNNBox
  # NMS settings; `name` selects the NMS implementation to use.
  nms:
    name: MultiClassNMS
    keep_top_k: 100
    score_threshold: 0.05
    nms_threshold: 0.5