Meta Llama-3-8b with Self-Play Preference Optimization for Language Model Alignment at iteration 3
186 Pulls Updated 4 months ago
5862a8175edc · 3.3GB
-
general.architecturellama
-
general.file_type23
-
general.name..
-
general.quantization_version2
-
llama.attention.head_count32
-
llama.attention.head_count_kv8
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length8192
-
llama.embedding_length4096
-
llama.feed_forward_length14336
-
llama.rope.dimension_count128
-
llama.rope.freq_base500000
-
llama.vocab_size128256
-
quantize.imatrix.chunks_count124
-
quantize.imatrix.dataset/shared/opt/work_models/_imatrix/calibration_datav3.txt
-
quantize.imatrix.entries_count224
-
quantize.imatrix.fileimatrix.dat
-
tokenizer.ggml.bos_token_id128000
-
tokenizer.ggml.eos_token_id128009
-
tokenizer.ggml.merges[Ġ Ġ, Ġ ĠĠĠ, ĠĠ ĠĠ, ĠĠĠ Ġ, i n, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.padding_token_id128009
-
tokenizer.ggml.prellama-bpe
-
tokenizer.ggml.token_type[1, 1, 1, 1, 1, ...]
-
tokenizer.ggml.tokens[!, ", #, $, %, ...]
-
NameTypeShape
-
token_embd.weight(!unknown_type 21!)[4096, 128256]
-
blk.0.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.0.attn_norm.weightF32[4096]
-
blk.0.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.0.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.0.attn_v.weightQ4_K[4096, 1024]
-
blk.0.ffn_down.weightI32[14336, 4096]
-
blk.0.ffn_gate.weightI32[4096, 14336]
-
blk.0.ffn_norm.weightF32[4096]
-
blk.0.ffn_up.weightI32[4096, 14336]
-
blk.1.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.1.attn_norm.weightF32[4096]
-
blk.1.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.1.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.1.attn_v.weightQ4_K[4096, 1024]
-
blk.1.ffn_down.weightI32[14336, 4096]
-
blk.1.ffn_gate.weightI32[4096, 14336]
-
blk.1.ffn_norm.weightF32[4096]
-
blk.1.ffn_up.weightI32[4096, 14336]
-
blk.2.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.2.attn_norm.weightF32[4096]
-
blk.2.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.2.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.2.attn_v.weightQ4_K[4096, 1024]
-
blk.2.ffn_down.weightI32[14336, 4096]
-
blk.2.ffn_gate.weightI32[4096, 14336]
-
blk.2.ffn_norm.weightF32[4096]
-
blk.2.ffn_up.weightI32[4096, 14336]
-
blk.3.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.3.attn_norm.weightF32[4096]
-
blk.3.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.3.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.3.attn_v.weightQ4_K[4096, 1024]
-
blk.3.ffn_down.weightI32[14336, 4096]
-
blk.3.ffn_gate.weightI32[4096, 14336]
-
blk.3.ffn_norm.weightF32[4096]
-
blk.3.ffn_up.weightI32[4096, 14336]
-
blk.4.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.4.attn_norm.weightF32[4096]
-
blk.4.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.4.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.4.attn_v.weightQ4_K[4096, 1024]
-
blk.4.ffn_down.weightI32[14336, 4096]
-
blk.4.ffn_gate.weightI32[4096, 14336]
-
blk.4.ffn_norm.weightF32[4096]
-
blk.4.ffn_up.weightI32[4096, 14336]
-
blk.5.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.5.attn_norm.weightF32[4096]
-
blk.5.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.5.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.5.attn_v.weightQ4_K[4096, 1024]
-
blk.5.ffn_down.weightI32[14336, 4096]
-
blk.5.ffn_gate.weightI32[4096, 14336]
-
blk.5.ffn_norm.weightF32[4096]
-
blk.5.ffn_up.weightI32[4096, 14336]
-
blk.6.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.6.attn_norm.weightF32[4096]
-
blk.6.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.6.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.6.attn_v.weightQ4_K[4096, 1024]
-
blk.6.ffn_down.weightI32[14336, 4096]
-
blk.6.ffn_gate.weightI32[4096, 14336]
-
blk.6.ffn_norm.weightF32[4096]
-
blk.6.ffn_up.weightI32[4096, 14336]
-
blk.7.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.7.attn_norm.weightF32[4096]
-
blk.7.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.7.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.7.attn_v.weightQ4_K[4096, 1024]
-
blk.7.ffn_down.weightI32[14336, 4096]
-
blk.7.ffn_gate.weightI32[4096, 14336]
-
blk.7.ffn_norm.weightF32[4096]
-
blk.7.ffn_up.weightI32[4096, 14336]
-
blk.8.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.8.attn_norm.weightF32[4096]
-
blk.8.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.8.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.8.attn_v.weightQ4_K[4096, 1024]
-
blk.8.ffn_down.weightI32[14336, 4096]
-
blk.8.ffn_gate.weightI32[4096, 14336]
-
blk.8.ffn_norm.weightF32[4096]
-
blk.8.ffn_up.weightI32[4096, 14336]
-
blk.9.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.9.attn_norm.weightF32[4096]
-
blk.9.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.9.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.9.attn_v.weightQ4_K[4096, 1024]
-
blk.9.ffn_down.weightI32[14336, 4096]
-
blk.9.ffn_gate.weightI32[4096, 14336]
-
blk.9.ffn_norm.weightF32[4096]
-
blk.9.ffn_up.weightI32[4096, 14336]
-
blk.10.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.10.attn_norm.weightF32[4096]
-
blk.10.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.10.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.10.attn_v.weightQ4_K[4096, 1024]
-
blk.10.ffn_down.weightI32[14336, 4096]
-
blk.10.ffn_gate.weightI32[4096, 14336]
-
blk.10.ffn_norm.weightF32[4096]
-
blk.10.ffn_up.weightI32[4096, 14336]
-
blk.11.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.11.attn_norm.weightF32[4096]
-
blk.11.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.11.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.11.attn_v.weightQ4_K[4096, 1024]
-
blk.11.ffn_down.weightI32[14336, 4096]
-
blk.11.ffn_gate.weightI32[4096, 14336]
-
blk.11.ffn_norm.weightF32[4096]
-
blk.11.ffn_up.weightI32[4096, 14336]
-
blk.12.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.12.attn_norm.weightF32[4096]
-
blk.12.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.12.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.12.attn_v.weightQ4_K[4096, 1024]
-
blk.12.ffn_down.weightI32[14336, 4096]
-
blk.12.ffn_gate.weightI32[4096, 14336]
-
blk.12.ffn_norm.weightF32[4096]
-
blk.12.ffn_up.weightI32[4096, 14336]
-
blk.13.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.13.attn_norm.weightF32[4096]
-
blk.13.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.13.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.13.attn_v.weightQ4_K[4096, 1024]
-
blk.13.ffn_down.weightI32[14336, 4096]
-
blk.13.ffn_gate.weightI32[4096, 14336]
-
blk.13.ffn_norm.weightF32[4096]
-
blk.13.ffn_up.weightI32[4096, 14336]
-
blk.14.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.14.attn_norm.weightF32[4096]
-
blk.14.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.14.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.14.attn_v.weightQ4_K[4096, 1024]
-
blk.14.ffn_down.weightI32[14336, 4096]
-
blk.14.ffn_gate.weightI32[4096, 14336]
-
blk.14.ffn_norm.weightF32[4096]
-
blk.14.ffn_up.weightI32[4096, 14336]
-
blk.15.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.15.attn_norm.weightF32[4096]
-
blk.15.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.15.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.15.attn_v.weightQ4_K[4096, 1024]
-
blk.15.ffn_down.weightI32[14336, 4096]
-
blk.15.ffn_gate.weightI32[4096, 14336]
-
blk.15.ffn_norm.weightF32[4096]
-
blk.15.ffn_up.weightI32[4096, 14336]
-
blk.16.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.16.attn_norm.weightF32[4096]
-
blk.16.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.16.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.16.attn_v.weightQ4_K[4096, 1024]
-
blk.16.ffn_down.weightI32[14336, 4096]
-
blk.16.ffn_gate.weightI32[4096, 14336]
-
blk.16.ffn_norm.weightF32[4096]
-
blk.16.ffn_up.weightI32[4096, 14336]
-
blk.17.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.17.attn_norm.weightF32[4096]
-
blk.17.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.17.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.17.attn_v.weightQ4_K[4096, 1024]
-
blk.17.ffn_down.weightI32[14336, 4096]
-
blk.17.ffn_gate.weightI32[4096, 14336]
-
blk.17.ffn_norm.weightF32[4096]
-
blk.17.ffn_up.weightI32[4096, 14336]
-
blk.18.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.18.attn_norm.weightF32[4096]
-
blk.18.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.18.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.18.attn_v.weightQ4_K[4096, 1024]
-
blk.18.ffn_down.weightI32[14336, 4096]
-
blk.18.ffn_gate.weightI32[4096, 14336]
-
blk.18.ffn_norm.weightF32[4096]
-
blk.18.ffn_up.weightI32[4096, 14336]
-
blk.19.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.19.attn_norm.weightF32[4096]
-
blk.19.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.19.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.19.attn_v.weightQ4_K[4096, 1024]
-
blk.19.ffn_down.weightI32[14336, 4096]
-
blk.19.ffn_gate.weightI32[4096, 14336]
-
blk.19.ffn_norm.weightF32[4096]
-
blk.19.ffn_up.weightI32[4096, 14336]
-
blk.20.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.20.attn_norm.weightF32[4096]
-
blk.20.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.20.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.20.attn_v.weightQ4_K[4096, 1024]
-
blk.20.ffn_down.weightI32[14336, 4096]
-
blk.20.ffn_gate.weightI32[4096, 14336]
-
blk.20.ffn_norm.weightF32[4096]
-
blk.20.ffn_up.weightI32[4096, 14336]
-
blk.21.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.21.attn_norm.weightF32[4096]
-
blk.21.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.21.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.21.attn_v.weightQ4_K[4096, 1024]
-
blk.21.ffn_down.weightI32[14336, 4096]
-
blk.21.ffn_gate.weightI32[4096, 14336]
-
blk.21.ffn_norm.weightF32[4096]
-
blk.21.ffn_up.weightI32[4096, 14336]
-
blk.22.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.22.attn_norm.weightF32[4096]
-
blk.22.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.22.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.22.attn_v.weightQ4_K[4096, 1024]
-
blk.22.ffn_down.weightI32[14336, 4096]
-
blk.22.ffn_gate.weightI32[4096, 14336]
-
blk.22.ffn_norm.weightF32[4096]
-
blk.22.ffn_up.weightI32[4096, 14336]
-
blk.23.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.23.attn_norm.weightF32[4096]
-
blk.23.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.23.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.23.attn_v.weightQ4_K[4096, 1024]
-
blk.23.ffn_down.weightI32[14336, 4096]
-
blk.23.ffn_gate.weightI32[4096, 14336]
-
blk.23.ffn_norm.weightF32[4096]
-
blk.23.ffn_up.weightI32[4096, 14336]
-
blk.24.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.24.attn_norm.weightF32[4096]
-
blk.24.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.24.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.24.attn_v.weightQ4_K[4096, 1024]
-
blk.24.ffn_down.weightI32[14336, 4096]
-
blk.24.ffn_gate.weightI32[4096, 14336]
-
blk.24.ffn_norm.weightF32[4096]
-
blk.24.ffn_up.weightI32[4096, 14336]
-
blk.25.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.25.attn_norm.weightF32[4096]
-
blk.25.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.25.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.25.attn_v.weightQ4_K[4096, 1024]
-
blk.25.ffn_down.weightI32[14336, 4096]
-
blk.25.ffn_gate.weightI32[4096, 14336]
-
blk.25.ffn_norm.weightF32[4096]
-
blk.25.ffn_up.weightI32[4096, 14336]
-
blk.26.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.26.attn_norm.weightF32[4096]
-
blk.26.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.26.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.26.attn_v.weightQ4_K[4096, 1024]
-
blk.26.ffn_down.weightI32[14336, 4096]
-
blk.26.ffn_gate.weightI32[4096, 14336]
-
blk.26.ffn_norm.weightF32[4096]
-
blk.26.ffn_up.weightI32[4096, 14336]
-
blk.27.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.27.attn_norm.weightF32[4096]
-
blk.27.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.27.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.27.attn_v.weightQ4_K[4096, 1024]
-
blk.27.ffn_down.weightI32[14336, 4096]
-
blk.27.ffn_gate.weightI32[4096, 14336]
-
blk.27.ffn_norm.weightF32[4096]
-
blk.27.ffn_up.weightI32[4096, 14336]
-
blk.28.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.28.attn_norm.weightF32[4096]
-
blk.28.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.28.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.28.attn_v.weightQ4_K[4096, 1024]
-
blk.28.ffn_down.weightI32[14336, 4096]
-
blk.28.ffn_gate.weightI32[4096, 14336]
-
blk.28.ffn_norm.weightF32[4096]
-
blk.28.ffn_up.weightI32[4096, 14336]
-
blk.29.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.29.attn_norm.weightF32[4096]
-
blk.29.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.29.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.29.attn_v.weightQ4_K[4096, 1024]
-
blk.29.ffn_down.weightI32[14336, 4096]
-
blk.29.ffn_gate.weightI32[4096, 14336]
-
blk.29.ffn_norm.weightF32[4096]
-
blk.29.ffn_up.weightI32[4096, 14336]
-
blk.30.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.30.attn_norm.weightF32[4096]
-
blk.30.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.30.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.30.attn_v.weightQ4_K[4096, 1024]
-
blk.30.ffn_down.weightI32[14336, 4096]
-
blk.30.ffn_gate.weightI32[4096, 14336]
-
blk.30.ffn_norm.weightF32[4096]
-
blk.30.ffn_up.weightI32[4096, 14336]
-
blk.31.attn_k.weight(!unknown_type 22!)[4096, 1024]
-
blk.31.attn_norm.weightF32[4096]
-
blk.31.attn_output.weight(!unknown_type 21!)[4096, 4096]
-
blk.31.attn_q.weight(!unknown_type 22!)[4096, 4096]
-
blk.31.attn_v.weightQ4_K[4096, 1024]
-
blk.31.ffn_down.weightI32[14336, 4096]
-
blk.31.ffn_gate.weightI32[4096, 14336]
-
blk.31.ffn_norm.weightF32[4096]
-
blk.31.ffn_up.weightI32[4096, 14336]
-
output.weightQ5_K[4096, 128256]
-
output_norm.weightF32[4096]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31