🪐 A family of small models with 135M, 360M, and 1.7B parameters, trained on a new high-quality dataset.
135m
360m
1.7b
82.6K Pulls Updated 2 months ago
88371a6f7399 · 219MB
-
general.architecturellama
-
general.basenameSmolLM
-
general.datasets[HuggingFaceTB/smollm-corpus]
-
general.file_type11
-
general.languages[en]
-
general.licenseapache-2.0
-
general.nameSmolLM 360M
-
general.quantization_version2
-
general.size_label360M
-
general.typemodel
-
llama.attention.head_count15
-
llama.attention.head_count_kv5
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length2048
-
llama.embedding_length960
-
llama.feed_forward_length2560
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_space_prefixfalse
-
tokenizer.ggml.bos_token_id0
-
tokenizer.ggml.eos_token_id0
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.presmollm
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[960, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.0.attn_norm.weightF32[960]
-
blk.0.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.0.ffn_down.weightQ3_K[2560, 960]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.0.ffn_norm.weightF32[960]
-
blk.0.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.1.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.1.attn_norm.weightF32[960]
-
blk.1.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.1.ffn_down.weightQ3_K[2560, 960]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.1.ffn_norm.weightF32[960]
-
blk.1.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.2.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.2.attn_norm.weightF32[960]
-
blk.2.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.2.ffn_down.weightQ3_K[2560, 960]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.2.ffn_norm.weightF32[960]
-
blk.2.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.3.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.3.attn_norm.weightF32[960]
-
blk.3.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.3.ffn_down.weightQ3_K[2560, 960]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.3.ffn_norm.weightF32[960]
-
blk.3.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.4.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.4.attn_norm.weightF32[960]
-
blk.4.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.4.ffn_down.weightQ3_K[2560, 960]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.4.ffn_norm.weightF32[960]
-
blk.4.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.5.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.5.attn_norm.weightF32[960]
-
blk.5.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.5.ffn_down.weightQ3_K[2560, 960]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.5.ffn_norm.weightF32[960]
-
blk.5.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.6.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.6.attn_norm.weightF32[960]
-
blk.6.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.6.ffn_down.weightQ3_K[2560, 960]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.6.ffn_norm.weightF32[960]
-
blk.6.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.7.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.7.attn_norm.weightF32[960]
-
blk.7.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.7.ffn_down.weightQ3_K[2560, 960]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.7.ffn_norm.weightF32[960]
-
blk.7.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.8.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.8.attn_norm.weightF32[960]
-
blk.8.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.8.ffn_down.weightQ3_K[2560, 960]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.8.ffn_norm.weightF32[960]
-
blk.8.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.9.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.9.attn_norm.weightF32[960]
-
blk.9.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.9.ffn_down.weightQ3_K[2560, 960]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.9.ffn_norm.weightF32[960]
-
blk.9.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.10.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.10.attn_norm.weightF32[960]
-
blk.10.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.10.ffn_down.weightQ3_K[2560, 960]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.10.ffn_norm.weightF32[960]
-
blk.10.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.11.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.11.attn_norm.weightF32[960]
-
blk.11.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.11.ffn_down.weightQ3_K[2560, 960]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.11.ffn_norm.weightF32[960]
-
blk.11.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.12.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.12.attn_norm.weightF32[960]
-
blk.12.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.12.ffn_down.weightQ3_K[2560, 960]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.12.ffn_norm.weightF32[960]
-
blk.12.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.13.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.13.attn_norm.weightF32[960]
-
blk.13.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.13.ffn_down.weightQ3_K[2560, 960]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.13.ffn_norm.weightF32[960]
-
blk.13.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.14.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.14.attn_norm.weightF32[960]
-
blk.14.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.14.ffn_down.weightQ3_K[2560, 960]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.14.ffn_norm.weightF32[960]
-
blk.14.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.15.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.15.attn_norm.weightF32[960]
-
blk.15.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.15.ffn_down.weightQ3_K[2560, 960]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.15.ffn_norm.weightF32[960]
-
blk.15.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.16.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.16.attn_norm.weightF32[960]
-
blk.16.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.16.ffn_down.weightQ3_K[2560, 960]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.16.ffn_norm.weightF32[960]
-
blk.16.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.17.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.17.attn_norm.weightF32[960]
-
blk.17.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.17.ffn_down.weightQ3_K[2560, 960]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.17.ffn_norm.weightF32[960]
-
blk.17.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.18.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.18.attn_norm.weightF32[960]
-
blk.18.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.18.ffn_down.weightQ3_K[2560, 960]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.18.ffn_norm.weightF32[960]
-
blk.18.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.19.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.19.attn_norm.weightF32[960]
-
blk.19.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.19.ffn_down.weightQ3_K[2560, 960]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.19.ffn_norm.weightF32[960]
-
blk.19.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.20.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.20.attn_norm.weightF32[960]
-
blk.20.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.20.ffn_down.weightQ3_K[2560, 960]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.20.ffn_norm.weightF32[960]
-
blk.20.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.21.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.21.attn_norm.weightF32[960]
-
blk.21.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.21.ffn_down.weightQ3_K[2560, 960]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.21.ffn_norm.weightF32[960]
-
blk.21.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.22.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.22.attn_norm.weightF32[960]
-
blk.22.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.22.ffn_down.weightQ3_K[2560, 960]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.22.ffn_norm.weightF32[960]
-
blk.22.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.23.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.23.attn_norm.weightF32[960]
-
blk.23.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.23.ffn_down.weightQ3_K[2560, 960]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.23.ffn_norm.weightF32[960]
-
blk.23.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.24.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.24.attn_norm.weightF32[960]
-
blk.24.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.24.ffn_down.weightQ3_K[2560, 960]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.24.ffn_norm.weightF32[960]
-
blk.24.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.25.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.25.attn_norm.weightF32[960]
-
blk.25.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.25.ffn_down.weightQ3_K[2560, 960]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.25.ffn_norm.weightF32[960]
-
blk.25.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.26.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.26.attn_norm.weightF32[960]
-
blk.26.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.26.ffn_down.weightQ3_K[2560, 960]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.26.ffn_norm.weightF32[960]
-
blk.26.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.27.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.27.attn_norm.weightF32[960]
-
blk.27.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.27.ffn_down.weightQ3_K[2560, 960]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.27.ffn_norm.weightF32[960]
-
blk.27.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.28.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.28.attn_norm.weightF32[960]
-
blk.28.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.28.ffn_down.weightQ3_K[2560, 960]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.28.ffn_norm.weightF32[960]
-
blk.28.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.29.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.29.attn_norm.weightF32[960]
-
blk.29.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.29.ffn_down.weightQ3_K[2560, 960]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.29.ffn_norm.weightF32[960]
-
blk.29.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.30.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.30.attn_norm.weightF32[960]
-
blk.30.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.30.ffn_down.weightQ3_K[2560, 960]
-
blk.30.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.30.ffn_norm.weightF32[960]
-
blk.30.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.31.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.31.attn_norm.weightF32[960]
-
blk.31.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.31.ffn_down.weightQ3_K[2560, 960]
-
blk.31.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.31.ffn_norm.weightF32[960]
-
blk.31.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
output_norm.weightF32[960]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31