Models
GitHub
Discord
Docs
Cloud
Sign in
Download
Models
Download
GitHub
Discord
Docs
Cloud
Sign in
cogito-2.1
:latest
5
Downloads
Updated
2 minutes ago
The Cogito v2.1 LLMs are instruction tuned generative models. All models are released under MIT license for commercial use.
The Cogito v2.1 LLMs are instruction tuned generative models. All models are released under MIT license for commercial use.
Cancel
cloud
671b
cogito-2.1:latest
...
/
model
fca0589090b8 · 1.3TB
Metadata
general.architecture
deepseek2
deepseek2
general.file_type
F16
F16
deepseek2.attention.head_count
128
128
deepseek2.attention.head_count_kv
1
1
deepseek2.attention.key_length
576
576
deepseek2.attention.key_length_mla
192
192
deepseek2.attention.kv_lora_rank
512
512
deepseek2.attention.layer_norm_rms_epsilon
1e-06
1e-06
deepseek2.attention.q_lora_rank
1536
1536
deepseek2.attention.value_length
512
512
deepseek2.attention.value_length_mla
128
128
deepseek2.block_count
61
61
deepseek2.context_length
163840
163840
deepseek2.embedding_length
7168
7168
deepseek2.expert_count
256
256
deepseek2.expert_feed_forward_length
2048
2048
deepseek2.expert_gating_func
2
2
deepseek2.expert_group_count
8
8
deepseek2.expert_group_used_count
4
4
deepseek2.expert_shared_count
1
1
deepseek2.expert_used_count
8
8
deepseek2.expert_weights_norm
true
true
deepseek2.expert_weights_scale
2.5
2.5
deepseek2.feed_forward_length
18432
18432
deepseek2.leading_dense_block_count
3
3
deepseek2.rope.dimension_count
64
64
deepseek2.rope.freq_base
10000
10000
deepseek2.rope.scaling.factor
40
40
deepseek2.rope.scaling.original_context_length
4096
4096
deepseek2.rope.scaling.type
yarn
yarn
deepseek2.rope.scaling.yarn_log_multiplier
0.1
0.1
deepseek2.vocab_size
128815
128815
tokenizer.ggml.add_bos_token
true
true
tokenizer.ggml.add_eos_token
false
false
tokenizer.ggml.add_sep_token
false
false
tokenizer.ggml.bos_token_id
0
0
tokenizer.ggml.eos_token_id
1
1
tokenizer.ggml.merges
[Ġ t, Ġ a, i n, Ġ Ġ, h e, ...]
[Ġ t, Ġ a, i n, Ġ Ġ, h e, ...]
tokenizer.ggml.model
gpt2
gpt2
tokenizer.ggml.padding_token_id
2
2
tokenizer.ggml.pre
deepseek-v3
deepseek-v3
tokenizer.ggml.token_type
[3, 3, 3, 1, 1, ...]
[3, 3, 3, 1, 1, ...]
tokenizer.ggml.tokens
[<|begin▁of▁sentence|>, <|end▁of▁sentence|>, <|▁pad▁|>, !, ", ...]
[<|begin▁of▁sentence|>, <|end▁of▁sentence|>, <|▁pad▁|>, !, ", ...]
Tensor
Name
Type
Shape
token_embd.weight
F16
F16
[7168, 128815]
blk.0
blk.0.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.0.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.0.attn_kv_a_norm.weight
F32
F32
[512]
blk.0.attn_norm.weight
F32
F32
[7168]
blk.0.attn_output.weight
F16
F16
[16384, 7168]
blk.0.attn_q_a.weight
F16
F16
[7168, 1536]
blk.0.attn_q_a_norm.weight
F32
F32
[1536]
blk.0.attn_q_b.weight
F16
F16
[1536, 24576]
blk.0.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.0.ffn_down.weight
F16
F16
[18432, 7168]
blk.0.ffn_gate.weight
F16
F16
[7168, 18432]
blk.0.ffn_norm.weight
F32
F32
[7168]
blk.0.ffn_up.weight
F16
F16
[7168, 18432]
blk.1
blk.1.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.1.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.1.attn_kv_a_norm.weight
F32
F32
[512]
blk.1.attn_norm.weight
F32
F32
[7168]
blk.1.attn_output.weight
F16
F16
[16384, 7168]
blk.1.attn_q_a.weight
F16
F16
[7168, 1536]
blk.1.attn_q_a_norm.weight
F32
F32
[1536]
blk.1.attn_q_b.weight
F16
F16
[1536, 24576]
blk.1.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.1.ffn_down.weight
F16
F16
[18432, 7168]
blk.1.ffn_gate.weight
F16
F16
[7168, 18432]
blk.1.ffn_norm.weight
F32
F32
[7168]
blk.1.ffn_up.weight
F16
F16
[7168, 18432]
blk.2
blk.2.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.2.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.2.attn_kv_a_norm.weight
F32
F32
[512]
blk.2.attn_norm.weight
F32
F32
[7168]
blk.2.attn_output.weight
F16
F16
[16384, 7168]
blk.2.attn_q_a.weight
F16
F16
[7168, 1536]
blk.2.attn_q_a_norm.weight
F32
F32
[1536]
blk.2.attn_q_b.weight
F16
F16
[1536, 24576]
blk.2.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.2.ffn_down.weight
F16
F16
[18432, 7168]
blk.2.ffn_gate.weight
F16
F16
[7168, 18432]
blk.2.ffn_norm.weight
F32
F32
[7168]
blk.2.ffn_up.weight
F16
F16
[7168, 18432]
blk.3
blk.3.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.3.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.3.attn_kv_a_norm.weight
F32
F32
[512]
blk.3.attn_norm.weight
F32
F32
[7168]
blk.3.attn_output.weight
F16
F16
[16384, 7168]
blk.3.attn_q_a.weight
F16
F16
[7168, 1536]
blk.3.attn_q_a_norm.weight
F32
F32
[1536]
blk.3.attn_q_b.weight
F16
F16
[1536, 24576]
blk.3.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.3.exp_probs_b.bias
F32
F32
[256]
blk.3.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.3.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.3.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.3.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.3.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.3.ffn_norm.weight
F32
F32
[7168]
blk.3.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.3.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.4
blk.4.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.4.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.4.attn_kv_a_norm.weight
F32
F32
[512]
blk.4.attn_norm.weight
F32
F32
[7168]
blk.4.attn_output.weight
F16
F16
[16384, 7168]
blk.4.attn_q_a.weight
F16
F16
[7168, 1536]
blk.4.attn_q_a_norm.weight
F32
F32
[1536]
blk.4.attn_q_b.weight
F16
F16
[1536, 24576]
blk.4.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.4.exp_probs_b.bias
F32
F32
[256]
blk.4.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.4.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.4.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.4.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.4.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.4.ffn_norm.weight
F32
F32
[7168]
blk.4.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.4.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.5
blk.5.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.5.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.5.attn_kv_a_norm.weight
F32
F32
[512]
blk.5.attn_norm.weight
F32
F32
[7168]
blk.5.attn_output.weight
F16
F16
[16384, 7168]
blk.5.attn_q_a.weight
F16
F16
[7168, 1536]
blk.5.attn_q_a_norm.weight
F32
F32
[1536]
blk.5.attn_q_b.weight
F16
F16
[1536, 24576]
blk.5.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.5.exp_probs_b.bias
F32
F32
[256]
blk.5.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.5.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.5.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.5.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.5.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.5.ffn_norm.weight
F32
F32
[7168]
blk.5.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.5.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.6
blk.6.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.6.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.6.attn_kv_a_norm.weight
F32
F32
[512]
blk.6.attn_norm.weight
F32
F32
[7168]
blk.6.attn_output.weight
F16
F16
[16384, 7168]
blk.6.attn_q_a.weight
F16
F16
[7168, 1536]
blk.6.attn_q_a_norm.weight
F32
F32
[1536]
blk.6.attn_q_b.weight
F16
F16
[1536, 24576]
blk.6.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.6.exp_probs_b.bias
F32
F32
[256]
blk.6.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.6.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.6.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.6.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.6.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.6.ffn_norm.weight
F32
F32
[7168]
blk.6.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.6.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.7
blk.7.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.7.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.7.attn_kv_a_norm.weight
F32
F32
[512]
blk.7.attn_norm.weight
F32
F32
[7168]
blk.7.attn_output.weight
F16
F16
[16384, 7168]
blk.7.attn_q_a.weight
F16
F16
[7168, 1536]
blk.7.attn_q_a_norm.weight
F32
F32
[1536]
blk.7.attn_q_b.weight
F16
F16
[1536, 24576]
blk.7.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.7.exp_probs_b.bias
F32
F32
[256]
blk.7.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.7.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.7.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.7.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.7.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.7.ffn_norm.weight
F32
F32
[7168]
blk.7.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.7.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.8
blk.8.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.8.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.8.attn_kv_a_norm.weight
F32
F32
[512]
blk.8.attn_norm.weight
F32
F32
[7168]
blk.8.attn_output.weight
F16
F16
[16384, 7168]
blk.8.attn_q_a.weight
F16
F16
[7168, 1536]
blk.8.attn_q_a_norm.weight
F32
F32
[1536]
blk.8.attn_q_b.weight
F16
F16
[1536, 24576]
blk.8.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.8.exp_probs_b.bias
F32
F32
[256]
blk.8.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.8.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.8.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.8.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.8.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.8.ffn_norm.weight
F32
F32
[7168]
blk.8.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.8.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.9
blk.9.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.9.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.9.attn_kv_a_norm.weight
F32
F32
[512]
blk.9.attn_norm.weight
F32
F32
[7168]
blk.9.attn_output.weight
F16
F16
[16384, 7168]
blk.9.attn_q_a.weight
F16
F16
[7168, 1536]
blk.9.attn_q_a_norm.weight
F32
F32
[1536]
blk.9.attn_q_b.weight
F16
F16
[1536, 24576]
blk.9.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.9.exp_probs_b.bias
F32
F32
[256]
blk.9.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.9.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.9.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.9.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.9.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.9.ffn_norm.weight
F32
F32
[7168]
blk.9.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.9.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.10
blk.10.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.10.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.10.attn_kv_a_norm.weight
F32
F32
[512]
blk.10.attn_norm.weight
F32
F32
[7168]
blk.10.attn_output.weight
F16
F16
[16384, 7168]
blk.10.attn_q_a.weight
F16
F16
[7168, 1536]
blk.10.attn_q_a_norm.weight
F32
F32
[1536]
blk.10.attn_q_b.weight
F16
F16
[1536, 24576]
blk.10.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.10.exp_probs_b.bias
F32
F32
[256]
blk.10.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.10.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.10.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.10.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.10.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.10.ffn_norm.weight
F32
F32
[7168]
blk.10.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.10.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.11
blk.11.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.11.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.11.attn_kv_a_norm.weight
F32
F32
[512]
blk.11.attn_norm.weight
F32
F32
[7168]
blk.11.attn_output.weight
F16
F16
[16384, 7168]
blk.11.attn_q_a.weight
F16
F16
[7168, 1536]
blk.11.attn_q_a_norm.weight
F32
F32
[1536]
blk.11.attn_q_b.weight
F16
F16
[1536, 24576]
blk.11.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.11.exp_probs_b.bias
F32
F32
[256]
blk.11.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.11.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.11.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.11.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.11.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.11.ffn_norm.weight
F32
F32
[7168]
blk.11.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.11.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.12
blk.12.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.12.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.12.attn_kv_a_norm.weight
F32
F32
[512]
blk.12.attn_norm.weight
F32
F32
[7168]
blk.12.attn_output.weight
F16
F16
[16384, 7168]
blk.12.attn_q_a.weight
F16
F16
[7168, 1536]
blk.12.attn_q_a_norm.weight
F32
F32
[1536]
blk.12.attn_q_b.weight
F16
F16
[1536, 24576]
blk.12.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.12.exp_probs_b.bias
F32
F32
[256]
blk.12.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.12.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.12.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.12.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.12.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.12.ffn_norm.weight
F32
F32
[7168]
blk.12.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.12.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.13
blk.13.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.13.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.13.attn_kv_a_norm.weight
F32
F32
[512]
blk.13.attn_norm.weight
F32
F32
[7168]
blk.13.attn_output.weight
F16
F16
[16384, 7168]
blk.13.attn_q_a.weight
F16
F16
[7168, 1536]
blk.13.attn_q_a_norm.weight
F32
F32
[1536]
blk.13.attn_q_b.weight
F16
F16
[1536, 24576]
blk.13.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.13.exp_probs_b.bias
F32
F32
[256]
blk.13.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.13.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.13.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.13.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.13.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.13.ffn_norm.weight
F32
F32
[7168]
blk.13.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.13.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.14
blk.14.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.14.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.14.attn_kv_a_norm.weight
F32
F32
[512]
blk.14.attn_norm.weight
F32
F32
[7168]
blk.14.attn_output.weight
F16
F16
[16384, 7168]
blk.14.attn_q_a.weight
F16
F16
[7168, 1536]
blk.14.attn_q_a_norm.weight
F32
F32
[1536]
blk.14.attn_q_b.weight
F16
F16
[1536, 24576]
blk.14.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.14.exp_probs_b.bias
F32
F32
[256]
blk.14.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.14.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.14.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.14.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.14.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.14.ffn_norm.weight
F32
F32
[7168]
blk.14.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.14.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.15
blk.15.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.15.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.15.attn_kv_a_norm.weight
F32
F32
[512]
blk.15.attn_norm.weight
F32
F32
[7168]
blk.15.attn_output.weight
F16
F16
[16384, 7168]
blk.15.attn_q_a.weight
F16
F16
[7168, 1536]
blk.15.attn_q_a_norm.weight
F32
F32
[1536]
blk.15.attn_q_b.weight
F16
F16
[1536, 24576]
blk.15.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.15.exp_probs_b.bias
F32
F32
[256]
blk.15.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.15.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.15.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.15.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.15.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.15.ffn_norm.weight
F32
F32
[7168]
blk.15.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.15.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.16
blk.16.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.16.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.16.attn_kv_a_norm.weight
F32
F32
[512]
blk.16.attn_norm.weight
F32
F32
[7168]
blk.16.attn_output.weight
F16
F16
[16384, 7168]
blk.16.attn_q_a.weight
F16
F16
[7168, 1536]
blk.16.attn_q_a_norm.weight
F32
F32
[1536]
blk.16.attn_q_b.weight
F16
F16
[1536, 24576]
blk.16.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.16.exp_probs_b.bias
F32
F32
[256]
blk.16.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.16.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.16.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.16.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.16.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.16.ffn_norm.weight
F32
F32
[7168]
blk.16.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.16.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.17
blk.17.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.17.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.17.attn_kv_a_norm.weight
F32
F32
[512]
blk.17.attn_norm.weight
F32
F32
[7168]
blk.17.attn_output.weight
F16
F16
[16384, 7168]
blk.17.attn_q_a.weight
F16
F16
[7168, 1536]
blk.17.attn_q_a_norm.weight
F32
F32
[1536]
blk.17.attn_q_b.weight
F16
F16
[1536, 24576]
blk.17.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.17.exp_probs_b.bias
F32
F32
[256]
blk.17.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.17.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.17.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.17.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.17.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.17.ffn_norm.weight
F32
F32
[7168]
blk.17.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.17.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.18
blk.18.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.18.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.18.attn_kv_a_norm.weight
F32
F32
[512]
blk.18.attn_norm.weight
F32
F32
[7168]
blk.18.attn_output.weight
F16
F16
[16384, 7168]
blk.18.attn_q_a.weight
F16
F16
[7168, 1536]
blk.18.attn_q_a_norm.weight
F32
F32
[1536]
blk.18.attn_q_b.weight
F16
F16
[1536, 24576]
blk.18.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.18.exp_probs_b.bias
F32
F32
[256]
blk.18.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.18.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.18.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.18.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.18.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.18.ffn_norm.weight
F32
F32
[7168]
blk.18.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.18.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.19
blk.19.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.19.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.19.attn_kv_a_norm.weight
F32
F32
[512]
blk.19.attn_norm.weight
F32
F32
[7168]
blk.19.attn_output.weight
F16
F16
[16384, 7168]
blk.19.attn_q_a.weight
F16
F16
[7168, 1536]
blk.19.attn_q_a_norm.weight
F32
F32
[1536]
blk.19.attn_q_b.weight
F16
F16
[1536, 24576]
blk.19.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.19.exp_probs_b.bias
F32
F32
[256]
blk.19.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.19.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.19.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.19.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.19.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.19.ffn_norm.weight
F32
F32
[7168]
blk.19.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.19.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.20
blk.20.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.20.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.20.attn_kv_a_norm.weight
F32
F32
[512]
blk.20.attn_norm.weight
F32
F32
[7168]
blk.20.attn_output.weight
F16
F16
[16384, 7168]
blk.20.attn_q_a.weight
F16
F16
[7168, 1536]
blk.20.attn_q_a_norm.weight
F32
F32
[1536]
blk.20.attn_q_b.weight
F16
F16
[1536, 24576]
blk.20.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.20.exp_probs_b.bias
F32
F32
[256]
blk.20.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.20.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.20.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.20.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.20.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.20.ffn_norm.weight
F32
F32
[7168]
blk.20.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.20.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.21
blk.21.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.21.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.21.attn_kv_a_norm.weight
F32
F32
[512]
blk.21.attn_norm.weight
F32
F32
[7168]
blk.21.attn_output.weight
F16
F16
[16384, 7168]
blk.21.attn_q_a.weight
F16
F16
[7168, 1536]
blk.21.attn_q_a_norm.weight
F32
F32
[1536]
blk.21.attn_q_b.weight
F16
F16
[1536, 24576]
blk.21.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.21.exp_probs_b.bias
F32
F32
[256]
blk.21.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.21.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.21.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.21.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.21.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.21.ffn_norm.weight
F32
F32
[7168]
blk.21.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.21.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.22
blk.22.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.22.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.22.attn_kv_a_norm.weight
F32
F32
[512]
blk.22.attn_norm.weight
F32
F32
[7168]
blk.22.attn_output.weight
F16
F16
[16384, 7168]
blk.22.attn_q_a.weight
F16
F16
[7168, 1536]
blk.22.attn_q_a_norm.weight
F32
F32
[1536]
blk.22.attn_q_b.weight
F16
F16
[1536, 24576]
blk.22.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.22.exp_probs_b.bias
F32
F32
[256]
blk.22.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.22.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.22.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.22.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.22.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.22.ffn_norm.weight
F32
F32
[7168]
blk.22.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.22.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.23
blk.23.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.23.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.23.attn_kv_a_norm.weight
F32
F32
[512]
blk.23.attn_norm.weight
F32
F32
[7168]
blk.23.attn_output.weight
F16
F16
[16384, 7168]
blk.23.attn_q_a.weight
F16
F16
[7168, 1536]
blk.23.attn_q_a_norm.weight
F32
F32
[1536]
blk.23.attn_q_b.weight
F16
F16
[1536, 24576]
blk.23.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.23.exp_probs_b.bias
F32
F32
[256]
blk.23.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.23.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.23.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.23.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.23.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.23.ffn_norm.weight
F32
F32
[7168]
blk.23.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.23.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.24
blk.24.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.24.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.24.attn_kv_a_norm.weight
F32
F32
[512]
blk.24.attn_norm.weight
F32
F32
[7168]
blk.24.attn_output.weight
F16
F16
[16384, 7168]
blk.24.attn_q_a.weight
F16
F16
[7168, 1536]
blk.24.attn_q_a_norm.weight
F32
F32
[1536]
blk.24.attn_q_b.weight
F16
F16
[1536, 24576]
blk.24.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.24.exp_probs_b.bias
F32
F32
[256]
blk.24.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.24.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.24.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.24.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.24.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.24.ffn_norm.weight
F32
F32
[7168]
blk.24.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.24.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.25
blk.25.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.25.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.25.attn_kv_a_norm.weight
F32
F32
[512]
blk.25.attn_norm.weight
F32
F32
[7168]
blk.25.attn_output.weight
F16
F16
[16384, 7168]
blk.25.attn_q_a.weight
F16
F16
[7168, 1536]
blk.25.attn_q_a_norm.weight
F32
F32
[1536]
blk.25.attn_q_b.weight
F16
F16
[1536, 24576]
blk.25.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.25.exp_probs_b.bias
F32
F32
[256]
blk.25.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.25.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.25.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.25.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.25.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.25.ffn_norm.weight
F32
F32
[7168]
blk.25.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.25.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.26
blk.26.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.26.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.26.attn_kv_a_norm.weight
F32
F32
[512]
blk.26.attn_norm.weight
F32
F32
[7168]
blk.26.attn_output.weight
F16
F16
[16384, 7168]
blk.26.attn_q_a.weight
F16
F16
[7168, 1536]
blk.26.attn_q_a_norm.weight
F32
F32
[1536]
blk.26.attn_q_b.weight
F16
F16
[1536, 24576]
blk.26.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.26.exp_probs_b.bias
F32
F32
[256]
blk.26.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.26.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.26.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.26.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.26.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.26.ffn_norm.weight
F32
F32
[7168]
blk.26.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.26.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.27
blk.27.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.27.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.27.attn_kv_a_norm.weight
F32
F32
[512]
blk.27.attn_norm.weight
F32
F32
[7168]
blk.27.attn_output.weight
F16
F16
[16384, 7168]
blk.27.attn_q_a.weight
F16
F16
[7168, 1536]
blk.27.attn_q_a_norm.weight
F32
F32
[1536]
blk.27.attn_q_b.weight
F16
F16
[1536, 24576]
blk.27.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.27.exp_probs_b.bias
F32
F32
[256]
blk.27.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.27.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.27.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.27.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.27.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.27.ffn_norm.weight
F32
F32
[7168]
blk.27.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.27.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.28
blk.28.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.28.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.28.attn_kv_a_norm.weight
F32
F32
[512]
blk.28.attn_norm.weight
F32
F32
[7168]
blk.28.attn_output.weight
F16
F16
[16384, 7168]
blk.28.attn_q_a.weight
F16
F16
[7168, 1536]
blk.28.attn_q_a_norm.weight
F32
F32
[1536]
blk.28.attn_q_b.weight
F16
F16
[1536, 24576]
blk.28.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.28.exp_probs_b.bias
F32
F32
[256]
blk.28.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.28.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.28.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.28.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.28.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.28.ffn_norm.weight
F32
F32
[7168]
blk.28.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.28.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.29
blk.29.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.29.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.29.attn_kv_a_norm.weight
F32
F32
[512]
blk.29.attn_norm.weight
F32
F32
[7168]
blk.29.attn_output.weight
F16
F16
[16384, 7168]
blk.29.attn_q_a.weight
F16
F16
[7168, 1536]
blk.29.attn_q_a_norm.weight
F32
F32
[1536]
blk.29.attn_q_b.weight
F16
F16
[1536, 24576]
blk.29.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.29.exp_probs_b.bias
F32
F32
[256]
blk.29.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.29.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.29.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.29.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.29.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.29.ffn_norm.weight
F32
F32
[7168]
blk.29.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.29.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.30
blk.30.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.30.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.30.attn_kv_a_norm.weight
F32
F32
[512]
blk.30.attn_norm.weight
F32
F32
[7168]
blk.30.attn_output.weight
F16
F16
[16384, 7168]
blk.30.attn_q_a.weight
F16
F16
[7168, 1536]
blk.30.attn_q_a_norm.weight
F32
F32
[1536]
blk.30.attn_q_b.weight
F16
F16
[1536, 24576]
blk.30.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.30.exp_probs_b.bias
F32
F32
[256]
blk.30.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.30.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.30.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.30.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.30.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.30.ffn_norm.weight
F32
F32
[7168]
blk.30.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.30.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.31
blk.31.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.31.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.31.attn_kv_a_norm.weight
F32
F32
[512]
blk.31.attn_norm.weight
F32
F32
[7168]
blk.31.attn_output.weight
F16
F16
[16384, 7168]
blk.31.attn_q_a.weight
F16
F16
[7168, 1536]
blk.31.attn_q_a_norm.weight
F32
F32
[1536]
blk.31.attn_q_b.weight
F16
F16
[1536, 24576]
blk.31.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.31.exp_probs_b.bias
F32
F32
[256]
blk.31.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.31.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.31.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.31.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.31.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.31.ffn_norm.weight
F32
F32
[7168]
blk.31.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.31.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.32
blk.32.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.32.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.32.attn_kv_a_norm.weight
F32
F32
[512]
blk.32.attn_norm.weight
F32
F32
[7168]
blk.32.attn_output.weight
F16
F16
[16384, 7168]
blk.32.attn_q_a.weight
F16
F16
[7168, 1536]
blk.32.attn_q_a_norm.weight
F32
F32
[1536]
blk.32.attn_q_b.weight
F16
F16
[1536, 24576]
blk.32.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.32.exp_probs_b.bias
F32
F32
[256]
blk.32.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.32.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.32.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.32.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.32.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.32.ffn_norm.weight
F32
F32
[7168]
blk.32.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.32.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.33
blk.33.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.33.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.33.attn_kv_a_norm.weight
F32
F32
[512]
blk.33.attn_norm.weight
F32
F32
[7168]
blk.33.attn_output.weight
F16
F16
[16384, 7168]
blk.33.attn_q_a.weight
F16
F16
[7168, 1536]
blk.33.attn_q_a_norm.weight
F32
F32
[1536]
blk.33.attn_q_b.weight
F16
F16
[1536, 24576]
blk.33.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.33.exp_probs_b.bias
F32
F32
[256]
blk.33.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.33.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.33.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.33.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.33.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.33.ffn_norm.weight
F32
F32
[7168]
blk.33.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.33.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.34
blk.34.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.34.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.34.attn_kv_a_norm.weight
F32
F32
[512]
blk.34.attn_norm.weight
F32
F32
[7168]
blk.34.attn_output.weight
F16
F16
[16384, 7168]
blk.34.attn_q_a.weight
F16
F16
[7168, 1536]
blk.34.attn_q_a_norm.weight
F32
F32
[1536]
blk.34.attn_q_b.weight
F16
F16
[1536, 24576]
blk.34.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.34.exp_probs_b.bias
F32
F32
[256]
blk.34.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.34.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.34.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.34.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.34.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.34.ffn_norm.weight
F32
F32
[7168]
blk.34.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.34.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.35
blk.35.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.35.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.35.attn_kv_a_norm.weight
F32
F32
[512]
blk.35.attn_norm.weight
F32
F32
[7168]
blk.35.attn_output.weight
F16
F16
[16384, 7168]
blk.35.attn_q_a.weight
F16
F16
[7168, 1536]
blk.35.attn_q_a_norm.weight
F32
F32
[1536]
blk.35.attn_q_b.weight
F16
F16
[1536, 24576]
blk.35.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.35.exp_probs_b.bias
F32
F32
[256]
blk.35.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.35.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.35.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.35.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.35.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.35.ffn_norm.weight
F32
F32
[7168]
blk.35.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.35.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.36
blk.36.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.36.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.36.attn_kv_a_norm.weight
F32
F32
[512]
blk.36.attn_norm.weight
F32
F32
[7168]
blk.36.attn_output.weight
F16
F16
[16384, 7168]
blk.36.attn_q_a.weight
F16
F16
[7168, 1536]
blk.36.attn_q_a_norm.weight
F32
F32
[1536]
blk.36.attn_q_b.weight
F16
F16
[1536, 24576]
blk.36.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.36.exp_probs_b.bias
F32
F32
[256]
blk.36.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.36.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.36.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.36.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.36.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.36.ffn_norm.weight
F32
F32
[7168]
blk.36.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.36.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.37
blk.37.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.37.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.37.attn_kv_a_norm.weight
F32
F32
[512]
blk.37.attn_norm.weight
F32
F32
[7168]
blk.37.attn_output.weight
F16
F16
[16384, 7168]
blk.37.attn_q_a.weight
F16
F16
[7168, 1536]
blk.37.attn_q_a_norm.weight
F32
F32
[1536]
blk.37.attn_q_b.weight
F16
F16
[1536, 24576]
blk.37.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.37.exp_probs_b.bias
F32
F32
[256]
blk.37.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.37.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.37.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.37.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.37.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.37.ffn_norm.weight
F32
F32
[7168]
blk.37.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.37.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.38
blk.38.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.38.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.38.attn_kv_a_norm.weight
F32
F32
[512]
blk.38.attn_norm.weight
F32
F32
[7168]
blk.38.attn_output.weight
F16
F16
[16384, 7168]
blk.38.attn_q_a.weight
F16
F16
[7168, 1536]
blk.38.attn_q_a_norm.weight
F32
F32
[1536]
blk.38.attn_q_b.weight
F16
F16
[1536, 24576]
blk.38.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.38.exp_probs_b.bias
F32
F32
[256]
blk.38.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.38.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.38.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.38.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.38.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.38.ffn_norm.weight
F32
F32
[7168]
blk.38.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.38.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.39
blk.39.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.39.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.39.attn_kv_a_norm.weight
F32
F32
[512]
blk.39.attn_norm.weight
F32
F32
[7168]
blk.39.attn_output.weight
F16
F16
[16384, 7168]
blk.39.attn_q_a.weight
F16
F16
[7168, 1536]
blk.39.attn_q_a_norm.weight
F32
F32
[1536]
blk.39.attn_q_b.weight
F16
F16
[1536, 24576]
blk.39.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.39.exp_probs_b.bias
F32
F32
[256]
blk.39.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.39.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.39.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.39.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.39.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.39.ffn_norm.weight
F32
F32
[7168]
blk.39.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.39.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.40
blk.40.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.40.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.40.attn_kv_a_norm.weight
F32
F32
[512]
blk.40.attn_norm.weight
F32
F32
[7168]
blk.40.attn_output.weight
F16
F16
[16384, 7168]
blk.40.attn_q_a.weight
F16
F16
[7168, 1536]
blk.40.attn_q_a_norm.weight
F32
F32
[1536]
blk.40.attn_q_b.weight
F16
F16
[1536, 24576]
blk.40.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.40.exp_probs_b.bias
F32
F32
[256]
blk.40.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.40.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.40.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.40.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.40.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.40.ffn_norm.weight
F32
F32
[7168]
blk.40.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.40.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.41
blk.41.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.41.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.41.attn_kv_a_norm.weight
F32
F32
[512]
blk.41.attn_norm.weight
F32
F32
[7168]
blk.41.attn_output.weight
F16
F16
[16384, 7168]
blk.41.attn_q_a.weight
F16
F16
[7168, 1536]
blk.41.attn_q_a_norm.weight
F32
F32
[1536]
blk.41.attn_q_b.weight
F16
F16
[1536, 24576]
blk.41.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.41.exp_probs_b.bias
F32
F32
[256]
blk.41.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.41.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.41.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.41.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.41.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.41.ffn_norm.weight
F32
F32
[7168]
blk.41.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.41.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.42
blk.42.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.42.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.42.attn_kv_a_norm.weight
F32
F32
[512]
blk.42.attn_norm.weight
F32
F32
[7168]
blk.42.attn_output.weight
F16
F16
[16384, 7168]
blk.42.attn_q_a.weight
F16
F16
[7168, 1536]
blk.42.attn_q_a_norm.weight
F32
F32
[1536]
blk.42.attn_q_b.weight
F16
F16
[1536, 24576]
blk.42.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.42.exp_probs_b.bias
F32
F32
[256]
blk.42.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.42.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.42.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.42.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.42.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.42.ffn_norm.weight
F32
F32
[7168]
blk.42.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.42.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.43
blk.43.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.43.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.43.attn_kv_a_norm.weight
F32
F32
[512]
blk.43.attn_norm.weight
F32
F32
[7168]
blk.43.attn_output.weight
F16
F16
[16384, 7168]
blk.43.attn_q_a.weight
F16
F16
[7168, 1536]
blk.43.attn_q_a_norm.weight
F32
F32
[1536]
blk.43.attn_q_b.weight
F16
F16
[1536, 24576]
blk.43.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.43.exp_probs_b.bias
F32
F32
[256]
blk.43.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.43.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.43.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.43.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.43.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.43.ffn_norm.weight
F32
F32
[7168]
blk.43.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.43.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.44
blk.44.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.44.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.44.attn_kv_a_norm.weight
F32
F32
[512]
blk.44.attn_norm.weight
F32
F32
[7168]
blk.44.attn_output.weight
F16
F16
[16384, 7168]
blk.44.attn_q_a.weight
F16
F16
[7168, 1536]
blk.44.attn_q_a_norm.weight
F32
F32
[1536]
blk.44.attn_q_b.weight
F16
F16
[1536, 24576]
blk.44.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.44.exp_probs_b.bias
F32
F32
[256]
blk.44.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.44.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.44.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.44.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.44.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.44.ffn_norm.weight
F32
F32
[7168]
blk.44.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.44.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.45
blk.45.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.45.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.45.attn_kv_a_norm.weight
F32
F32
[512]
blk.45.attn_norm.weight
F32
F32
[7168]
blk.45.attn_output.weight
F16
F16
[16384, 7168]
blk.45.attn_q_a.weight
F16
F16
[7168, 1536]
blk.45.attn_q_a_norm.weight
F32
F32
[1536]
blk.45.attn_q_b.weight
F16
F16
[1536, 24576]
blk.45.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.45.exp_probs_b.bias
F32
F32
[256]
blk.45.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.45.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.45.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.45.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.45.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.45.ffn_norm.weight
F32
F32
[7168]
blk.45.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.45.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.46
blk.46.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.46.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.46.attn_kv_a_norm.weight
F32
F32
[512]
blk.46.attn_norm.weight
F32
F32
[7168]
blk.46.attn_output.weight
F16
F16
[16384, 7168]
blk.46.attn_q_a.weight
F16
F16
[7168, 1536]
blk.46.attn_q_a_norm.weight
F32
F32
[1536]
blk.46.attn_q_b.weight
F16
F16
[1536, 24576]
blk.46.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.46.exp_probs_b.bias
F32
F32
[256]
blk.46.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.46.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.46.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.46.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.46.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.46.ffn_norm.weight
F32
F32
[7168]
blk.46.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.46.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.47
blk.47.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.47.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.47.attn_kv_a_norm.weight
F32
F32
[512]
blk.47.attn_norm.weight
F32
F32
[7168]
blk.47.attn_output.weight
F16
F16
[16384, 7168]
blk.47.attn_q_a.weight
F16
F16
[7168, 1536]
blk.47.attn_q_a_norm.weight
F32
F32
[1536]
blk.47.attn_q_b.weight
F16
F16
[1536, 24576]
blk.47.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.47.exp_probs_b.bias
F32
F32
[256]
blk.47.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.47.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.47.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.47.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.47.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.47.ffn_norm.weight
F32
F32
[7168]
blk.47.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.47.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.48
blk.48.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.48.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.48.attn_kv_a_norm.weight
F32
F32
[512]
blk.48.attn_norm.weight
F32
F32
[7168]
blk.48.attn_output.weight
F16
F16
[16384, 7168]
blk.48.attn_q_a.weight
F16
F16
[7168, 1536]
blk.48.attn_q_a_norm.weight
F32
F32
[1536]
blk.48.attn_q_b.weight
F16
F16
[1536, 24576]
blk.48.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.48.exp_probs_b.bias
F32
F32
[256]
blk.48.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.48.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.48.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.48.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.48.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.48.ffn_norm.weight
F32
F32
[7168]
blk.48.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.48.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.49
blk.49.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.49.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.49.attn_kv_a_norm.weight
F32
F32
[512]
blk.49.attn_norm.weight
F32
F32
[7168]
blk.49.attn_output.weight
F16
F16
[16384, 7168]
blk.49.attn_q_a.weight
F16
F16
[7168, 1536]
blk.49.attn_q_a_norm.weight
F32
F32
[1536]
blk.49.attn_q_b.weight
F16
F16
[1536, 24576]
blk.49.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.49.exp_probs_b.bias
F32
F32
[256]
blk.49.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.49.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.49.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.49.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.49.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.49.ffn_norm.weight
F32
F32
[7168]
blk.49.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.49.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.50
blk.50.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.50.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.50.attn_kv_a_norm.weight
F32
F32
[512]
blk.50.attn_norm.weight
F32
F32
[7168]
blk.50.attn_output.weight
F16
F16
[16384, 7168]
blk.50.attn_q_a.weight
F16
F16
[7168, 1536]
blk.50.attn_q_a_norm.weight
F32
F32
[1536]
blk.50.attn_q_b.weight
F16
F16
[1536, 24576]
blk.50.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.50.exp_probs_b.bias
F32
F32
[256]
blk.50.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.50.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.50.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.50.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.50.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.50.ffn_norm.weight
F32
F32
[7168]
blk.50.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.50.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.51
blk.51.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.51.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.51.attn_kv_a_norm.weight
F32
F32
[512]
blk.51.attn_norm.weight
F32
F32
[7168]
blk.51.attn_output.weight
F16
F16
[16384, 7168]
blk.51.attn_q_a.weight
F16
F16
[7168, 1536]
blk.51.attn_q_a_norm.weight
F32
F32
[1536]
blk.51.attn_q_b.weight
F16
F16
[1536, 24576]
blk.51.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.51.exp_probs_b.bias
F32
F32
[256]
blk.51.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.51.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.51.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.51.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.51.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.51.ffn_norm.weight
F32
F32
[7168]
blk.51.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.51.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.52
blk.52.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.52.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.52.attn_kv_a_norm.weight
F32
F32
[512]
blk.52.attn_norm.weight
F32
F32
[7168]
blk.52.attn_output.weight
F16
F16
[16384, 7168]
blk.52.attn_q_a.weight
F16
F16
[7168, 1536]
blk.52.attn_q_a_norm.weight
F32
F32
[1536]
blk.52.attn_q_b.weight
F16
F16
[1536, 24576]
blk.52.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.52.exp_probs_b.bias
F32
F32
[256]
blk.52.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.52.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.52.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.52.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.52.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.52.ffn_norm.weight
F32
F32
[7168]
blk.52.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.52.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.53
blk.53.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.53.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.53.attn_kv_a_norm.weight
F32
F32
[512]
blk.53.attn_norm.weight
F32
F32
[7168]
blk.53.attn_output.weight
F16
F16
[16384, 7168]
blk.53.attn_q_a.weight
F16
F16
[7168, 1536]
blk.53.attn_q_a_norm.weight
F32
F32
[1536]
blk.53.attn_q_b.weight
F16
F16
[1536, 24576]
blk.53.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.53.exp_probs_b.bias
F32
F32
[256]
blk.53.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.53.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.53.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.53.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.53.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.53.ffn_norm.weight
F32
F32
[7168]
blk.53.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.53.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.54
blk.54.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.54.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.54.attn_kv_a_norm.weight
F32
F32
[512]
blk.54.attn_norm.weight
F32
F32
[7168]
blk.54.attn_output.weight
F16
F16
[16384, 7168]
blk.54.attn_q_a.weight
F16
F16
[7168, 1536]
blk.54.attn_q_a_norm.weight
F32
F32
[1536]
blk.54.attn_q_b.weight
F16
F16
[1536, 24576]
blk.54.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.54.exp_probs_b.bias
F32
F32
[256]
blk.54.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.54.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.54.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.54.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.54.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.54.ffn_norm.weight
F32
F32
[7168]
blk.54.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.54.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.55
blk.55.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.55.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.55.attn_kv_a_norm.weight
F32
F32
[512]
blk.55.attn_norm.weight
F32
F32
[7168]
blk.55.attn_output.weight
F16
F16
[16384, 7168]
blk.55.attn_q_a.weight
F16
F16
[7168, 1536]
blk.55.attn_q_a_norm.weight
F32
F32
[1536]
blk.55.attn_q_b.weight
F16
F16
[1536, 24576]
blk.55.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.55.exp_probs_b.bias
F32
F32
[256]
blk.55.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.55.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.55.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.55.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.55.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.55.ffn_norm.weight
F32
F32
[7168]
blk.55.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.55.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.56
blk.56.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.56.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.56.attn_kv_a_norm.weight
F32
F32
[512]
blk.56.attn_norm.weight
F32
F32
[7168]
blk.56.attn_output.weight
F16
F16
[16384, 7168]
blk.56.attn_q_a.weight
F16
F16
[7168, 1536]
blk.56.attn_q_a_norm.weight
F32
F32
[1536]
blk.56.attn_q_b.weight
F16
F16
[1536, 24576]
blk.56.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.56.exp_probs_b.bias
F32
F32
[256]
blk.56.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.56.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.56.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.56.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.56.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.56.ffn_norm.weight
F32
F32
[7168]
blk.56.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.56.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.57
blk.57.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.57.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.57.attn_kv_a_norm.weight
F32
F32
[512]
blk.57.attn_norm.weight
F32
F32
[7168]
blk.57.attn_output.weight
F16
F16
[16384, 7168]
blk.57.attn_q_a.weight
F16
F16
[7168, 1536]
blk.57.attn_q_a_norm.weight
F32
F32
[1536]
blk.57.attn_q_b.weight
F16
F16
[1536, 24576]
blk.57.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.57.exp_probs_b.bias
F32
F32
[256]
blk.57.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.57.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.57.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.57.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.57.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.57.ffn_norm.weight
F32
F32
[7168]
blk.57.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.57.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.58
blk.58.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.58.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.58.attn_kv_a_norm.weight
F32
F32
[512]
blk.58.attn_norm.weight
F32
F32
[7168]
blk.58.attn_output.weight
F16
F16
[16384, 7168]
blk.58.attn_q_a.weight
F16
F16
[7168, 1536]
blk.58.attn_q_a_norm.weight
F32
F32
[1536]
blk.58.attn_q_b.weight
F16
F16
[1536, 24576]
blk.58.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.58.exp_probs_b.bias
F32
F32
[256]
blk.58.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.58.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.58.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.58.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.58.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.58.ffn_norm.weight
F32
F32
[7168]
blk.58.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.58.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.59
blk.59.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.59.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.59.attn_kv_a_norm.weight
F32
F32
[512]
blk.59.attn_norm.weight
F32
F32
[7168]
blk.59.attn_output.weight
F16
F16
[16384, 7168]
blk.59.attn_q_a.weight
F16
F16
[7168, 1536]
blk.59.attn_q_a_norm.weight
F32
F32
[1536]
blk.59.attn_q_b.weight
F16
F16
[1536, 24576]
blk.59.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.59.exp_probs_b.bias
F32
F32
[256]
blk.59.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.59.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.59.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.59.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.59.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.59.ffn_norm.weight
F32
F32
[7168]
blk.59.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.59.ffn_up_shexp.weight
F16
F16
[7168, 2048]
blk.60
blk.60.attn_k_b.weight
F16
F16
[128, 512, 128]
blk.60.attn_kv_a_mqa.weight
F16
F16
[7168, 576]
blk.60.attn_kv_a_norm.weight
F32
F32
[512]
blk.60.attn_norm.weight
F32
F32
[7168]
blk.60.attn_output.weight
F16
F16
[16384, 7168]
blk.60.attn_q_a.weight
F16
F16
[7168, 1536]
blk.60.attn_q_a_norm.weight
F32
F32
[1536]
blk.60.attn_q_b.weight
F16
F16
[1536, 24576]
blk.60.attn_v_b.weight
F16
F16
[512, 128, 128]
blk.60.exp_probs_b.bias
F32
F32
[256]
blk.60.ffn_down_exps.weight
F16
F16
[2048, 7168, 256]
blk.60.ffn_down_shexp.weight
F16
F16
[2048, 7168]
blk.60.ffn_gate_exps.weight
F16
F16
[7168, 2048, 256]
blk.60.ffn_gate_inp.weight
F32
F32
[7168, 256]
blk.60.ffn_gate_shexp.weight
F16
F16
[7168, 2048]
blk.60.ffn_norm.weight
F32
F32
[7168]
blk.60.ffn_up_exps.weight
F16
F16
[7168, 2048, 256]
blk.60.ffn_up_shexp.weight
F16
F16
[7168, 2048]
output.weight
F16
F16
[7168, 128815]
output_norm.weight
F32
F32
[7168]