|
| 1 | +# @package _global_ |
| 2 | + |
| 3 | +# Model |
| 4 | +model: |
| 5 | + _target_: sam2.modeling.sam2_base.SAM2Base |
| 6 | + image_encoder: |
| 7 | + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder |
| 8 | + scalp: 1 |
| 9 | + trunk: |
| 10 | + _target_: sam2.modeling.backbones.hieradet.Hiera |
| 11 | + embed_dim: 144 |
| 12 | + num_heads: 2 |
| 13 | + stages: [2, 6, 36, 4] |
| 14 | + global_att_blocks: [23, 33, 43] |
| 15 | + window_pos_embed_bkg_spatial_size: [7, 7] |
| 16 | + window_spec: [8, 4, 16, 8] |
| 17 | + neck: |
| 18 | + _target_: sam2.modeling.backbones.image_encoder.FpnNeck |
| 19 | + position_encoding: |
| 20 | + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| 21 | + num_pos_feats: 256 |
| 22 | + normalize: true |
| 23 | + scale: null |
| 24 | + temperature: 10000 |
| 25 | + d_model: 256 |
| 26 | + backbone_channel_list: [1152, 576, 288, 144] |
| 27 | + fpn_top_down_levels: [2, 3] # output levels 0 and 1 directly use the backbone features |
| 28 | + fpn_interp_model: nearest |
| 29 | + |
| 30 | + memory_attention: |
| 31 | + _target_: sam2.modeling.memory_attention.MemoryAttention |
| 32 | + d_model: 256 |
| 33 | + pos_enc_at_input: true |
| 34 | + layer: |
| 35 | + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer |
| 36 | + activation: relu |
| 37 | + dim_feedforward: 2048 |
| 38 | + dropout: 0.1 |
| 39 | + pos_enc_at_attn: false |
| 40 | + self_attention: |
| 41 | + _target_: sam2.modeling.sam.transformer.RoPEAttention |
| 42 | + rope_theta: 10000.0 |
| 43 | + feat_sizes: [64, 64] |
| 44 | + embedding_dim: 256 |
| 45 | + num_heads: 1 |
| 46 | + downsample_rate: 1 |
| 47 | + dropout: 0.1 |
| 48 | + d_model: 256 |
| 49 | + pos_enc_at_cross_attn_keys: true |
| 50 | + pos_enc_at_cross_attn_queries: false |
| 51 | + cross_attention: |
| 52 | + _target_: sam2.modeling.sam.transformer.RoPEAttention |
| 53 | + rope_theta: 10000.0 |
| 54 | + feat_sizes: [64, 64] |
| 55 | + rope_k_repeat: True |
| 56 | + embedding_dim: 256 |
| 57 | + num_heads: 1 |
| 58 | + downsample_rate: 1 |
| 59 | + dropout: 0.1 |
| 60 | + kv_in_dim: 64 |
| 61 | + num_layers: 4 |
| 62 | + |
| 63 | + memory_encoder: |
| 64 | + _target_: sam2.modeling.memory_encoder.MemoryEncoder |
| 65 | + out_dim: 64 |
| 66 | + position_encoding: |
| 67 | + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| 68 | + num_pos_feats: 64 |
| 69 | + normalize: true |
| 70 | + scale: null |
| 71 | + temperature: 10000 |
| 72 | + mask_downsampler: |
| 73 | + _target_: sam2.modeling.memory_encoder.MaskDownSampler |
| 74 | + kernel_size: 3 |
| 75 | + stride: 2 |
| 76 | + padding: 1 |
| 77 | + fuser: |
| 78 | + _target_: sam2.modeling.memory_encoder.Fuser |
| 79 | + layer: |
| 80 | + _target_: sam2.modeling.memory_encoder.CXBlock |
| 81 | + dim: 256 |
| 82 | + kernel_size: 7 |
| 83 | + padding: 3 |
| 84 | + layer_scale_init_value: 1e-6 |
| 85 | + use_dwconv: True # depth-wise convs |
| 86 | + num_layers: 2 |
| 87 | + |
| 88 | + num_maskmem: 7 |
| 89 | + image_size: 1024 |
| 90 | + # apply a scaled sigmoid to the mask logits for the memory encoder, and directly feed the input mask as the output mask |
| 91 | + sigmoid_scale_for_mem_enc: 20.0 |
| 92 | + sigmoid_bias_for_mem_enc: -10.0 |
| 93 | + use_mask_input_as_output_without_sam: true |
| 94 | + # Memory |
| 95 | + directly_add_no_mem_embed: true |
| 96 | + no_obj_embed_spatial: true |
| 97 | + # use high-resolution feature map in the SAM mask decoder |
| 98 | + use_high_res_features_in_sam: true |
| 99 | + # output 3 masks on the first click on initial conditioning frames |
| 100 | + multimask_output_in_sam: true |
| 101 | + # SAM heads |
| 102 | + iou_prediction_use_sigmoid: True |
| 103 | + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder |
| 104 | + use_obj_ptrs_in_encoder: true |
| 105 | + add_tpos_enc_to_obj_ptrs: true |
| 106 | + proj_tpos_enc_in_obj_ptrs: true |
| 107 | + use_signed_tpos_enc_to_obj_ptrs: true |
| 108 | + only_obj_ptrs_in_the_past_for_eval: true |
| 109 | + # object occlusion prediction |
| 110 | + pred_obj_scores: true |
| 111 | + pred_obj_scores_mlp: true |
| 112 | + fixed_no_obj_ptr: true |
| 113 | + # multimask tracking settings |
| 114 | + multimask_output_for_tracking: true |
| 115 | + use_multimask_token_for_obj_ptr: true |
| 116 | + multimask_min_pt_num: 0 |
| 117 | + multimask_max_pt_num: 1 |
| 118 | + use_mlp_for_obj_ptr_proj: true |
| 119 | + # Compilation flag |
| 120 | + compile_image_encoder: False |
0 commit comments