rg "HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Qwen/Qwen3-Next-80B-A3B-Instruct --local-dir /data/disk1/guohaoran/models/Qwen3-Next-80B-A3B-Instruct" rg "HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Qwen/Qwen3.5-397B-A17B --local-dir /data/disk1/guohaoran/models/Qwen3.5-397B-A17B" rg "HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Qwen/Qwen3.5-35B-A3B --local-dir /data/disk1/guohaoran/models/Qwen3.5-35B-A3B" rg "HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Qwen/Qwen3.5-0.8B --local-dir /data/disk1/guohaoran/models/Qwen3.5-0.8B"
Qwen3-Next-80B-A3B-Instruct
=== Embedding ===
embed_tokens.weight BF16 [151936, 2048]
=== Layer 0 (linear_attn) ===
linear_attn.dt_bias BF16 [32]
linear_attn.A_log BF16 [32]
linear_attn.conv1d.weight BF16 [8192, 1, 4]
linear_attn.in_proj_qkvz.weight BF16 [12288, 2048]
linear_attn.in_proj_ba.weight BF16 [64, 2048]
linear_attn.norm.weight BF16 [128]
linear_attn.out_proj.weight BF16 [2048, 4096]
mlp.gate.weight BF16 [512, 2048]
mlp.experts.gate_up_proj BF16 [512, 1024, 2048]
mlp.experts.down_proj BF16 [512, 2048, 512]
mlp.shared_expert.gate_proj.weight BF16 [512, 2048]
mlp.shared_expert.up_proj.weight BF16 [512, 2048]
mlp.shared_expert.down_proj.weight BF16 [2048, 512]
mlp.shared_expert_gate.weight BF16 [1, 2048]
input_layernorm.weight BF16 [2048]
post_attention_layernorm.weight BF16 [2048]
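
Layer 0's linear-attention shapes are self-consistent under one reading (head counts inferred from the shapes, not taken from a config): 16 key heads and 32 value heads, all of dim 128. Then q and k are 16*128 = 2048 each, v and z are 32*128 = 4096 each, the fused in_proj_qkvz output is 2048+2048+4096+4096 = 12288, the conv1d covers the q+k+v = 8192 channels, and in_proj_ba packs one b and one a scalar per value head (2*32 = 64), matching dt_bias/A_log at [32]. A hedged sketch of that arithmetic; the flat [q|k|v|z] ordering is an assumption, the checkpoint may interleave these per head group:

import torch

# Head counts inferred from the shapes above, not read from a config.
K_HEADS, V_HEADS, HEAD_DIM = 16, 32, 128
q_dim = k_dim = K_HEADS * HEAD_DIM   # 2048 each
v_dim = z_dim = V_HEADS * HEAD_DIM   # 4096 each
assert q_dim + k_dim + v_dim + z_dim == 12288  # in_proj_qkvz rows
assert q_dim + k_dim + v_dim == 8192           # conv1d channels (z is not convolved)
assert 2 * V_HEADS == 64                       # in_proj_ba rows: one b, one a per value head

def split_qkvz(qkvz: torch.Tensor):
    # Illustrative only: assumes a flat [q|k|v|z] concatenation along the last dim.
    return torch.split(qkvz, [q_dim, k_dim, v_dim, z_dim], dim=-1)
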
=== Layer 3 (self_attn) ===
self_attn.q_proj.weight BF16 [8192, 2048]
self_attn.k_proj.weight BF16 [512, 2048]
self_attn.v_proj.weight BF16 [512, 2048]
self_attn.o_proj.weight BF16 [2048, 4096]
self_attn.q_norm.weight BF16 [256]
self_attn.k_norm.weight BF16 [256]
mlp.gate.weight BF16 [512, 2048]
mlp.experts.gate_up_proj BF16 [512, 1024, 2048]
mlp.experts.down_proj BF16 [512, 2048, 512]
mlp.shared_expert.gate_proj.weight BF16 [512, 2048]
mlp.shared_expert.up_proj.weight BF16 [512, 2048]
mlp.shared_expert.down_proj.weight BF16 [2048, 512]
mlp.shared_expert_gate.weight BF16 [1, 2048]
input_layernorm.weight BF16 [2048]
post_attention_layernorm.weight BF16 [2048]
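
The full-attention shapes at layer 3 also decompose cleanly if one assumes a gated attention variant in which q_proj emits a query plus a per-head output gate: with 16 heads of dim 256 (q_norm/k_norm are [256], and o_proj consumes 16*256 = 4096), q_proj's 8192 rows are exactly 2*16*256, and the 512-row k/v projections imply 2 KV heads. A quick check under those assumptions:

# 80B layer-3 self_attn shape check; head counts and gating are inferred, not from a config.
HEADS, KV_HEADS, HEAD_DIM, HIDDEN = 16, 2, 256, 2048
assert 2 * HEADS * HEAD_DIM == 8192  # q_proj rows: query + per-head gate (assumed)
assert KV_HEADS * HEAD_DIM == 512    # k_proj / v_proj rows
assert HEADS * HEAD_DIM == 4096      # o_proj input columns
assert HIDDEN == 2048                # o_proj output rows, hidden size
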
=== LM Head ===
lm_head.weight BF16 [151936, 2048]
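
The MoE shapes alone pin down the per-layer expert budget; a rough count, straight from the listed shapes:

# Per-layer MoE parameters for the 80B dump: 512 experts, fused gate+up of width 1024,
# hidden 2048, expert intermediate 512.
experts, gate_up, hidden, inter = 512, 1024, 2048, 512
routed = experts * (gate_up * hidden + hidden * inter)  # 1_610_612_736 (~1.6B) per MoE layer
shared = 3 * hidden * inter + hidden                    # gate/up/down + shared_expert_gate
router = experts * hidden                               # mlp.gate
print(f"routed={routed:,} shared={shared:,} router={router:,}")
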
Qwen3.5-397B-A17B
=== Embedding ===
embed_tokens.weight BF16 [248320, 4096]
=== Layer 0 (linear_attn) ===
linear_attn.dt_bias BF16 [64]
linear_attn.A_log BF16 [64]
linear_attn.conv1d.weight BF16 [12288, 1, 4]
linear_attn.norm.weight BF16 [128]
linear_attn.out_proj.weight BF16 [4096, 8192]
linear_attn.in_proj_qkv.weight BF16 [12288, 4096]
linear_attn.in_proj_z.weight BF16 [8192, 4096]
linear_attn.in_proj_b.weight BF16 [64, 4096]
linear_attn.in_proj_a.weight BF16 [64, 4096]
mlp.gate.weight BF16 [512, 4096]
mlp.experts.gate_up_proj BF16 [512, 2048, 4096]
mlp.experts.down_proj BF16 [512, 4096, 1024]
mlp.shared_expert.gate_proj.weight BF16 [1024, 4096]
mlp.shared_expert.up_proj.weight BF16 [1024, 4096]
mlp.shared_expert.down_proj.weight BF16 [4096, 1024]
mlp.shared_expert_gate.weight BF16 [1, 4096]
input_layernorm.weight BF16 [4096]
post_attention_layernorm.weight BF16 [4096]
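
Unlike the 80B checkpoint, this one stores the linear-attention input projections unfused: qkv stays together, but z, b, and a get their own matrices. The shapes are again self-consistent assuming 16 key heads and 64 value heads of dim 128: q+k+v = 2048+2048+8192 = 12288 (in_proj_qkv rows and conv1d channels), z = 64*128 = 8192 (in_proj_z rows and out_proj input), and b/a carry one value per value head, matching dt_bias/A_log growing from [32] to [64]. The corresponding check:

# 397B layer-0 linear_attn shape check; head counts inferred from the shapes.
K_HEADS, V_HEADS, HEAD_DIM = 16, 64, 128
assert 2 * K_HEADS * HEAD_DIM + V_HEADS * HEAD_DIM == 12288  # in_proj_qkv rows, conv1d channels
assert V_HEADS * HEAD_DIM == 8192                            # in_proj_z rows, out_proj input
assert V_HEADS == 64                                         # in_proj_b/a rows, dt_bias, A_log
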
=== Layer 3 (self_attn) ===
self_attn.q_proj.weight BF16 [16384, 4096]
self_attn.k_proj.weight BF16 [512, 4096]
self_attn.v_proj.weight BF16 [512, 4096]
self_attn.o_proj.weight BF16 [4096, 8192]
self_attn.q_norm.weight BF16 [256]
self_attn.k_norm.weight BF16 [256]
mlp.gate.weight BF16 [512, 4096]
mlp.experts.gate_up_proj BF16 [512, 2048, 4096]
mlp.experts.down_proj BF16 [512, 4096, 1024]
mlp.shared_expert.gate_proj.weight BF16 [1024, 4096]
mlp.shared_expert.up_proj.weight BF16 [1024, 4096]
mlp.shared_expert.down_proj.weight BF16 [4096, 1024]
mlp.shared_expert_gate.weight BF16 [1, 4096]
input_layernorm.weight BF16 [4096]
post_attention_layernorm.weight BF16 [4096]
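
The gated-attention reading from the 80B dump carries over with the head count doubled: 32 heads of dim 256, still 2 KV heads (again assumed from the shapes, not from a config):

# 397B layer-3 self_attn shape check; head counts and gating are inferred.
HEADS, KV_HEADS, HEAD_DIM, HIDDEN = 32, 2, 256, 4096
assert 2 * HEADS * HEAD_DIM == 16384  # q_proj rows: query + per-head gate (assumed)
assert KV_HEADS * HEAD_DIM == 512     # k_proj / v_proj rows
assert HEADS * HEAD_DIM == 8192       # o_proj input columns
assert HIDDEN == 4096                 # o_proj output rows, hidden size
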
=== LM Head ===
lm_head.weight BF16 [248320, 4096]