🪐 A family of small models with 135M, 360M, and 1.7B parameters, trained on a new high-quality dataset.
135m
360m
1.7b
96.4K Pulls Updated 3 months ago
fcd748b951bc · 235MB
-
general.architecturellama
-
general.basenameSmolLM
-
general.datasets[HuggingFaceTB/smollm-corpus]
-
general.file_type12
-
general.languages[en]
-
general.licenseapache-2.0
-
general.nameSmolLM 360M
-
general.quantization_version2
-
general.size_label360M
-
general.typemodel
-
llama.attention.head_count15
-
llama.attention.head_count_kv5
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length2048
-
llama.embedding_length960
-
llama.feed_forward_length2560
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_space_prefixfalse
-
tokenizer.ggml.bos_token_id0
-
tokenizer.ggml.eos_token_id0
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.presmollm
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[960, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.0.attn_norm.weightF32[960]
-
blk.0.attn_output.weightQ5_0[960, 960]
-
blk.0.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_v.weightQ5_1[960, 320]
-
blk.0.ffn_down.weightQ5_K[2560, 960]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.0.ffn_norm.weightF32[960]
-
blk.0.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.1.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.1.attn_norm.weightF32[960]
-
blk.1.attn_output.weightQ5_0[960, 960]
-
blk.1.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_v.weightQ5_1[960, 320]
-
blk.1.ffn_down.weightQ5_K[2560, 960]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.1.ffn_norm.weightF32[960]
-
blk.1.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.2.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.2.attn_norm.weightF32[960]
-
blk.2.attn_output.weightQ5_0[960, 960]
-
blk.2.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_v.weightQ5_0[960, 320]
-
blk.2.ffn_down.weightQ4_K[2560, 960]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.2.ffn_norm.weightF32[960]
-
blk.2.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.3.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.3.attn_norm.weightF32[960]
-
blk.3.attn_output.weightQ5_0[960, 960]
-
blk.3.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_v.weightQ5_0[960, 320]
-
blk.3.ffn_down.weightQ4_K[2560, 960]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.3.ffn_norm.weightF32[960]
-
blk.3.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.4.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.4.attn_norm.weightF32[960]
-
blk.4.attn_output.weightQ5_0[960, 960]
-
blk.4.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_v.weightQ5_0[960, 320]
-
blk.4.ffn_down.weightQ4_K[2560, 960]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.4.ffn_norm.weightF32[960]
-
blk.4.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.5.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.5.attn_norm.weightF32[960]
-
blk.5.attn_output.weightQ5_0[960, 960]
-
blk.5.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_v.weightQ5_0[960, 320]
-
blk.5.ffn_down.weightQ4_K[2560, 960]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.5.ffn_norm.weightF32[960]
-
blk.5.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.6.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.6.attn_norm.weightF32[960]
-
blk.6.attn_output.weightQ5_0[960, 960]
-
blk.6.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_v.weightQ5_0[960, 320]
-
blk.6.ffn_down.weightQ4_K[2560, 960]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.6.ffn_norm.weightF32[960]
-
blk.6.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.7.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.7.attn_norm.weightF32[960]
-
blk.7.attn_output.weightQ5_0[960, 960]
-
blk.7.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_v.weightQ5_0[960, 320]
-
blk.7.ffn_down.weightQ4_K[2560, 960]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.7.ffn_norm.weightF32[960]
-
blk.7.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.8.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.8.attn_norm.weightF32[960]
-
blk.8.attn_output.weightQ5_0[960, 960]
-
blk.8.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_v.weightQ5_0[960, 320]
-
blk.8.ffn_down.weightQ4_K[2560, 960]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.8.ffn_norm.weightF32[960]
-
blk.8.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.9.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.9.attn_norm.weightF32[960]
-
blk.9.attn_output.weightQ5_0[960, 960]
-
blk.9.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_v.weightQ5_0[960, 320]
-
blk.9.ffn_down.weightQ4_K[2560, 960]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.9.ffn_norm.weightF32[960]
-
blk.9.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.10.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.10.attn_norm.weightF32[960]
-
blk.10.attn_output.weightQ5_0[960, 960]
-
blk.10.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_v.weightQ5_0[960, 320]
-
blk.10.ffn_down.weightQ4_K[2560, 960]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.10.ffn_norm.weightF32[960]
-
blk.10.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.11.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.11.attn_norm.weightF32[960]
-
blk.11.attn_output.weightQ5_0[960, 960]
-
blk.11.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_v.weightQ5_0[960, 320]
-
blk.11.ffn_down.weightQ4_K[2560, 960]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.11.ffn_norm.weightF32[960]
-
blk.11.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.12.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.12.attn_norm.weightF32[960]
-
blk.12.attn_output.weightQ5_0[960, 960]
-
blk.12.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_v.weightQ5_0[960, 320]
-
blk.12.ffn_down.weightQ4_K[2560, 960]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.12.ffn_norm.weightF32[960]
-
blk.12.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.13.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.13.attn_norm.weightF32[960]
-
blk.13.attn_output.weightQ5_0[960, 960]
-
blk.13.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_v.weightQ5_0[960, 320]
-
blk.13.ffn_down.weightQ4_K[2560, 960]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.13.ffn_norm.weightF32[960]
-
blk.13.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.14.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.14.attn_norm.weightF32[960]
-
blk.14.attn_output.weightQ5_0[960, 960]
-
blk.14.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_v.weightQ5_0[960, 320]
-
blk.14.ffn_down.weightQ4_K[2560, 960]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.14.ffn_norm.weightF32[960]
-
blk.14.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.15.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.15.attn_norm.weightF32[960]
-
blk.15.attn_output.weightQ5_0[960, 960]
-
blk.15.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_v.weightQ5_0[960, 320]
-
blk.15.ffn_down.weightQ4_K[2560, 960]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.15.ffn_norm.weightF32[960]
-
blk.15.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.16.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.16.attn_norm.weightF32[960]
-
blk.16.attn_output.weightQ5_0[960, 960]
-
blk.16.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_v.weightQ5_0[960, 320]
-
blk.16.ffn_down.weightQ4_K[2560, 960]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.16.ffn_norm.weightF32[960]
-
blk.16.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.17.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.17.attn_norm.weightF32[960]
-
blk.17.attn_output.weightQ5_0[960, 960]
-
blk.17.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_v.weightQ5_0[960, 320]
-
blk.17.ffn_down.weightQ4_K[2560, 960]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.17.ffn_norm.weightF32[960]
-
blk.17.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.18.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.18.attn_norm.weightF32[960]
-
blk.18.attn_output.weightQ5_0[960, 960]
-
blk.18.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_v.weightQ5_0[960, 320]
-
blk.18.ffn_down.weightQ4_K[2560, 960]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.18.ffn_norm.weightF32[960]
-
blk.18.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.19.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.19.attn_norm.weightF32[960]
-
blk.19.attn_output.weightQ5_0[960, 960]
-
blk.19.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_v.weightQ5_0[960, 320]
-
blk.19.ffn_down.weightQ4_K[2560, 960]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.19.ffn_norm.weightF32[960]
-
blk.19.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.20.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.20.attn_norm.weightF32[960]
-
blk.20.attn_output.weightQ5_0[960, 960]
-
blk.20.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_v.weightQ5_0[960, 320]
-
blk.20.ffn_down.weightQ4_K[2560, 960]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.20.ffn_norm.weightF32[960]
-
blk.20.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.21.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.21.attn_norm.weightF32[960]
-
blk.21.attn_output.weightQ5_0[960, 960]
-
blk.21.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_v.weightQ5_0[960, 320]
-
blk.21.ffn_down.weightQ4_K[2560, 960]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.21.ffn_norm.weightF32[960]
-
blk.21.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.22.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.22.attn_norm.weightF32[960]
-
blk.22.attn_output.weightQ5_0[960, 960]
-
blk.22.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_v.weightQ5_0[960, 320]
-
blk.22.ffn_down.weightQ4_K[2560, 960]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.22.ffn_norm.weightF32[960]
-
blk.22.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.23.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.23.attn_norm.weightF32[960]
-
blk.23.attn_output.weightQ5_0[960, 960]
-
blk.23.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_v.weightQ5_0[960, 320]
-
blk.23.ffn_down.weightQ4_K[2560, 960]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.23.ffn_norm.weightF32[960]
-
blk.23.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.24.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.24.attn_norm.weightF32[960]
-
blk.24.attn_output.weightQ5_0[960, 960]
-
blk.24.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_v.weightQ5_0[960, 320]
-
blk.24.ffn_down.weightQ4_K[2560, 960]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.24.ffn_norm.weightF32[960]
-
blk.24.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.25.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.25.attn_norm.weightF32[960]
-
blk.25.attn_output.weightQ5_0[960, 960]
-
blk.25.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_v.weightQ5_0[960, 320]
-
blk.25.ffn_down.weightQ4_K[2560, 960]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.25.ffn_norm.weightF32[960]
-
blk.25.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.26.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.26.attn_norm.weightF32[960]
-
blk.26.attn_output.weightQ5_0[960, 960]
-
blk.26.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_v.weightQ5_0[960, 320]
-
blk.26.ffn_down.weightQ4_K[2560, 960]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.26.ffn_norm.weightF32[960]
-
blk.26.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.27.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.27.attn_norm.weightF32[960]
-
blk.27.attn_output.weightQ5_0[960, 960]
-
blk.27.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_v.weightQ5_0[960, 320]
-
blk.27.ffn_down.weightQ4_K[2560, 960]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.27.ffn_norm.weightF32[960]
-
blk.27.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.28.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.28.attn_norm.weightF32[960]
-
blk.28.attn_output.weightQ5_0[960, 960]
-
blk.28.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_v.weightQ5_0[960, 320]
-
blk.28.ffn_down.weightQ4_K[2560, 960]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.28.ffn_norm.weightF32[960]
-
blk.28.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.29.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.29.attn_norm.weightF32[960]
-
blk.29.attn_output.weightQ5_0[960, 960]
-
blk.29.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_v.weightQ5_0[960, 320]
-
blk.29.ffn_down.weightQ4_K[2560, 960]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.29.ffn_norm.weightF32[960]
-
blk.29.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.30.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.30.attn_norm.weightF32[960]
-
blk.30.attn_output.weightQ5_0[960, 960]
-
blk.30.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_v.weightQ5_0[960, 320]
-
blk.30.ffn_down.weightQ4_K[2560, 960]
-
blk.30.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.30.ffn_norm.weightF32[960]
-
blk.30.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.31.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.31.attn_norm.weightF32[960]
-
blk.31.attn_output.weightQ5_0[960, 960]
-
blk.31.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_v.weightQ5_0[960, 320]
-
blk.31.ffn_down.weightQ4_K[2560, 960]
-
blk.31.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.31.ffn_norm.weightF32[960]
-
blk.31.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
output_norm.weightF32[960]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31