SmolLM with Unsloth
135m
360m
14 Pulls Updated 5 weeks ago
85720b7b3d8d · 219MB
-
general.architecturellama
-
general.file_type26
-
general.quantization_version2
-
llama.attention.head_count15
-
llama.attention.head_count_kv5
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length2048
-
llama.embedding_length960
-
llama.feed_forward_length2560
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_eos_tokenfalse
-
tokenizer.ggml.add_padding_tokenfalse
-
tokenizer.ggml.add_unknown_tokenfalse
-
tokenizer.ggml.bos_token_id0
-
tokenizer.ggml.eos_token_id0
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.padding_token_id16
-
tokenizer.ggml.predefault
-
tokenizer.ggml.scores[0, 1, 2, 3, 4, ...]
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[960, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.0.attn_norm.weightF32[960]
-
blk.0.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.0.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.0.ffn_norm.weightF32[960]
-
blk.0.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.1.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.1.attn_norm.weightF32[960]
-
blk.1.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.1.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.1.ffn_norm.weightF32[960]
-
blk.1.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.2.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.2.attn_norm.weightF32[960]
-
blk.2.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.2.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.2.ffn_norm.weightF32[960]
-
blk.2.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.3.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.3.attn_norm.weightF32[960]
-
blk.3.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.3.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.3.ffn_norm.weightF32[960]
-
blk.3.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.4.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.4.attn_norm.weightF32[960]
-
blk.4.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.4.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.4.ffn_norm.weightF32[960]
-
blk.4.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.5.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.5.attn_norm.weightF32[960]
-
blk.5.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.5.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.5.ffn_norm.weightF32[960]
-
blk.5.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.6.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.6.attn_norm.weightF32[960]
-
blk.6.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.6.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.6.ffn_norm.weightF32[960]
-
blk.6.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.7.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.7.attn_norm.weightF32[960]
-
blk.7.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.7.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.7.ffn_norm.weightF32[960]
-
blk.7.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.8.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.8.attn_norm.weightF32[960]
-
blk.8.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.8.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.8.ffn_norm.weightF32[960]
-
blk.8.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.9.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.9.attn_norm.weightF32[960]
-
blk.9.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.9.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.9.ffn_norm.weightF32[960]
-
blk.9.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.10.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.10.attn_norm.weightF32[960]
-
blk.10.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.10.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.10.ffn_norm.weightF32[960]
-
blk.10.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.11.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.11.attn_norm.weightF32[960]
-
blk.11.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.11.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.11.ffn_norm.weightF32[960]
-
blk.11.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.12.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.12.attn_norm.weightF32[960]
-
blk.12.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.12.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.12.ffn_norm.weightF32[960]
-
blk.12.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.13.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.13.attn_norm.weightF32[960]
-
blk.13.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.13.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.13.ffn_norm.weightF32[960]
-
blk.13.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.14.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.14.attn_norm.weightF32[960]
-
blk.14.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.14.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.14.ffn_norm.weightF32[960]
-
blk.14.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.15.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.15.attn_norm.weightF32[960]
-
blk.15.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.15.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.15.ffn_norm.weightF32[960]
-
blk.15.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.16.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.16.attn_norm.weightF32[960]
-
blk.16.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.16.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.16.ffn_norm.weightF32[960]
-
blk.16.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.17.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.17.attn_norm.weightF32[960]
-
blk.17.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.17.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.17.ffn_norm.weightF32[960]
-
blk.17.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.18.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.18.attn_norm.weightF32[960]
-
blk.18.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.18.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.18.ffn_norm.weightF32[960]
-
blk.18.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.19.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.19.attn_norm.weightF32[960]
-
blk.19.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.19.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.19.ffn_norm.weightF32[960]
-
blk.19.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.20.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.20.attn_norm.weightF32[960]
-
blk.20.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.20.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.20.ffn_norm.weightF32[960]
-
blk.20.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.21.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.21.attn_norm.weightF32[960]
-
blk.21.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.21.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.21.ffn_norm.weightF32[960]
-
blk.21.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.22.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.22.attn_norm.weightF32[960]
-
blk.22.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.22.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.22.ffn_norm.weightF32[960]
-
blk.22.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.23.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.23.attn_norm.weightF32[960]
-
blk.23.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.23.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.23.ffn_norm.weightF32[960]
-
blk.23.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.24.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.24.attn_norm.weightF32[960]
-
blk.24.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.24.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.24.ffn_norm.weightF32[960]
-
blk.24.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.25.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.25.attn_norm.weightF32[960]
-
blk.25.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.25.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.25.ffn_norm.weightF32[960]
-
blk.25.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.26.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.26.attn_norm.weightF32[960]
-
blk.26.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.26.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.26.ffn_norm.weightF32[960]
-
blk.26.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.27.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.27.attn_norm.weightF32[960]
-
blk.27.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.27.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.27.ffn_norm.weightF32[960]
-
blk.27.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.28.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.28.attn_norm.weightF32[960]
-
blk.28.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.28.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.28.ffn_norm.weightF32[960]
-
blk.28.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.29.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.29.attn_norm.weightF32[960]
-
blk.29.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.29.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.29.ffn_norm.weightF32[960]
-
blk.29.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.30.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.30.attn_norm.weightF32[960]
-
blk.30.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.30.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.30.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.30.ffn_norm.weightF32[960]
-
blk.30.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.31.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.31.attn_norm.weightF32[960]
-
blk.31.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.31.ffn_down.weight(!unknown_type 21!)[2560, 960]
-
blk.31.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.31.ffn_norm.weightF32[960]
-
blk.31.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
output_norm.weightF32[960]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31