7 2 months ago

This model uses the following training techniques. Please note that this is a play model for learning. It has been trained on tool use but due to its extremely small foundation model, it is unlikely to give reliable tool output. The main goal was to enable

1dec43734f33 · 252MB
    Metadata
  • general.architecture
    gpt2
  • general.file_type
    F16
  • gpt2.attention.head_count
    12
  • gpt2.attention.layer_norm_epsilon
    1e-05
  • gpt2.block_count
    12
  • gpt2.context_length
    1024
  • gpt2.embedding_length
    768
  • gpt2.feed_forward_length
    3072
  • tokenizer.ggml.add_bos_token
    false
  • tokenizer.ggml.bos_token_id
    50256
  • tokenizer.ggml.eos_token_id
    50256
  • tokenizer.ggml.merges
    [Ġ t, Ġ a, h e, i n, r e, ...]
  • tokenizer.ggml.model
    gpt2
  • tokenizer.ggml.pre
    gpt-2
  • tokenizer.ggml.token_type
    [1, 1, 1, 1, 1, ...]
  • tokenizer.ggml.tokens
    [!, ", #, $, %, ...]
  • Tensor
  • token_embd.weight
    F16
    [768, 50262]
  • blk.0
  • blk.0.attn_norm.bias
    F32
    [768]
  • blk.0.attn_norm.weight
    F32
    [768]
  • blk.0.attn_output.bias
    F32
    [768]
  • blk.0.attn_output.weight
    F16
    [768, 768]
  • blk.0.attn_qkv.bias
    F32
    [2304]
  • blk.0.attn_qkv.weight
    F16
    [768, 2304]
  • blk.0.ffn_down.bias
    F32
    [768]
  • blk.0.ffn_down.weight
    F16
    [3072, 768]
  • blk.0.ffn_norm.bias
    F32
    [768]
  • blk.0.ffn_norm.weight
    F32
    [768]
  • blk.0.ffn_up.bias
    F32
    [3072]
  • blk.0.ffn_up.weight
    F16
    [768, 3072]
  • blk.1
  • blk.1.attn_norm.bias
    F32
    [768]
  • blk.1.attn_norm.weight
    F32
    [768]
  • blk.1.attn_output.bias
    F32
    [768]
  • blk.1.attn_output.weight
    F16
    [768, 768]
  • blk.1.attn_qkv.bias
    F32
    [2304]
  • blk.1.attn_qkv.weight
    F16
    [768, 2304]
  • blk.1.ffn_down.bias
    F32
    [768]
  • blk.1.ffn_down.weight
    F16
    [3072, 768]
  • blk.1.ffn_norm.bias
    F32
    [768]
  • blk.1.ffn_norm.weight
    F32
    [768]
  • blk.1.ffn_up.bias
    F32
    [3072]
  • blk.1.ffn_up.weight
    F16
    [768, 3072]
  • blk.2
  • blk.2.attn_norm.bias
    F32
    [768]
  • blk.2.attn_norm.weight
    F32
    [768]
  • blk.2.attn_output.bias
    F32
    [768]
  • blk.2.attn_output.weight
    F16
    [768, 768]
  • blk.2.attn_qkv.bias
    F32
    [2304]
  • blk.2.attn_qkv.weight
    F16
    [768, 2304]
  • blk.2.ffn_down.bias
    F32
    [768]
  • blk.2.ffn_down.weight
    F16
    [3072, 768]
  • blk.2.ffn_norm.bias
    F32
    [768]
  • blk.2.ffn_norm.weight
    F32
    [768]
  • blk.2.ffn_up.bias
    F32
    [3072]
  • blk.2.ffn_up.weight
    F16
    [768, 3072]
  • blk.3
  • blk.3.attn_norm.bias
    F32
    [768]
  • blk.3.attn_norm.weight
    F32
    [768]
  • blk.3.attn_output.bias
    F32
    [768]
  • blk.3.attn_output.weight
    F16
    [768, 768]
  • blk.3.attn_qkv.bias
    F32
    [2304]
  • blk.3.attn_qkv.weight
    F16
    [768, 2304]
  • blk.3.ffn_down.bias
    F32
    [768]
  • blk.3.ffn_down.weight
    F16
    [3072, 768]
  • blk.3.ffn_norm.bias
    F32
    [768]
  • blk.3.ffn_norm.weight
    F32
    [768]
  • blk.3.ffn_up.bias
    F32
    [3072]
  • blk.3.ffn_up.weight
    F16
    [768, 3072]
  • blk.4
  • blk.4.attn_norm.bias
    F32
    [768]
  • blk.4.attn_norm.weight
    F32
    [768]
  • blk.4.attn_output.bias
    F32
    [768]
  • blk.4.attn_output.weight
    F16
    [768, 768]
  • blk.4.attn_qkv.bias
    F32
    [2304]
  • blk.4.attn_qkv.weight
    F16
    [768, 2304]
  • blk.4.ffn_down.bias
    F32
    [768]
  • blk.4.ffn_down.weight
    F16
    [3072, 768]
  • blk.4.ffn_norm.bias
    F32
    [768]
  • blk.4.ffn_norm.weight
    F32
    [768]
  • blk.4.ffn_up.bias
    F32
    [3072]
  • blk.4.ffn_up.weight
    F16
    [768, 3072]
  • blk.5
  • blk.5.attn_norm.bias
    F32
    [768]
  • blk.5.attn_norm.weight
    F32
    [768]
  • blk.5.attn_output.bias
    F32
    [768]
  • blk.5.attn_output.weight
    F16
    [768, 768]
  • blk.5.attn_qkv.bias
    F32
    [2304]
  • blk.5.attn_qkv.weight
    F16
    [768, 2304]
  • blk.5.ffn_down.bias
    F32
    [768]
  • blk.5.ffn_down.weight
    F16
    [3072, 768]
  • blk.5.ffn_norm.bias
    F32
    [768]
  • blk.5.ffn_norm.weight
    F32
    [768]
  • blk.5.ffn_up.bias
    F32
    [3072]
  • blk.5.ffn_up.weight
    F16
    [768, 3072]
  • blk.6
  • blk.6.attn_norm.bias
    F32
    [768]
  • blk.6.attn_norm.weight
    F32
    [768]
  • blk.6.attn_output.bias
    F32
    [768]
  • blk.6.attn_output.weight
    F16
    [768, 768]
  • blk.6.attn_qkv.bias
    F32
    [2304]
  • blk.6.attn_qkv.weight
    F16
    [768, 2304]
  • blk.6.ffn_down.bias
    F32
    [768]
  • blk.6.ffn_down.weight
    F16
    [3072, 768]
  • blk.6.ffn_norm.bias
    F32
    [768]
  • blk.6.ffn_norm.weight
    F32
    [768]
  • blk.6.ffn_up.bias
    F32
    [3072]
  • blk.6.ffn_up.weight
    F16
    [768, 3072]
  • blk.7
  • blk.7.attn_norm.bias
    F32
    [768]
  • blk.7.attn_norm.weight
    F32
    [768]
  • blk.7.attn_output.bias
    F32
    [768]
  • blk.7.attn_output.weight
    F16
    [768, 768]
  • blk.7.attn_qkv.bias
    F32
    [2304]
  • blk.7.attn_qkv.weight
    F16
    [768, 2304]
  • blk.7.ffn_down.bias
    F32
    [768]
  • blk.7.ffn_down.weight
    F16
    [3072, 768]
  • blk.7.ffn_norm.bias
    F32
    [768]
  • blk.7.ffn_norm.weight
    F32
    [768]
  • blk.7.ffn_up.bias
    F32
    [3072]
  • blk.7.ffn_up.weight
    F16
    [768, 3072]
  • blk.8
  • blk.8.attn_norm.bias
    F32
    [768]
  • blk.8.attn_norm.weight
    F32
    [768]
  • blk.8.attn_output.bias
    F32
    [768]
  • blk.8.attn_output.weight
    F16
    [768, 768]
  • blk.8.attn_qkv.bias
    F32
    [2304]
  • blk.8.attn_qkv.weight
    F16
    [768, 2304]
  • blk.8.ffn_down.bias
    F32
    [768]
  • blk.8.ffn_down.weight
    F16
    [3072, 768]
  • blk.8.ffn_norm.bias
    F32
    [768]
  • blk.8.ffn_norm.weight
    F32
    [768]
  • blk.8.ffn_up.bias
    F32
    [3072]
  • blk.8.ffn_up.weight
    F16
    [768, 3072]
  • blk.9
  • blk.9.attn_norm.bias
    F32
    [768]
  • blk.9.attn_norm.weight
    F32
    [768]
  • blk.9.attn_output.bias
    F32
    [768]
  • blk.9.attn_output.weight
    F16
    [768, 768]
  • blk.9.attn_qkv.bias
    F32
    [2304]
  • blk.9.attn_qkv.weight
    F16
    [768, 2304]
  • blk.9.ffn_down.bias
    F32
    [768]
  • blk.9.ffn_down.weight
    F16
    [3072, 768]
  • blk.9.ffn_norm.bias
    F32
    [768]
  • blk.9.ffn_norm.weight
    F32
    [768]
  • blk.9.ffn_up.bias
    F32
    [3072]
  • blk.9.ffn_up.weight
    F16
    [768, 3072]
  • blk.10
  • blk.10.attn_norm.bias
    F32
    [768]
  • blk.10.attn_norm.weight
    F32
    [768]
  • blk.10.attn_output.bias
    F32
    [768]
  • blk.10.attn_output.weight
    F16
    [768, 768]
  • blk.10.attn_qkv.bias
    F32
    [2304]
  • blk.10.attn_qkv.weight
    F16
    [768, 2304]
  • blk.10.ffn_down.bias
    F32
    [768]
  • blk.10.ffn_down.weight
    F16
    [3072, 768]
  • blk.10.ffn_norm.bias
    F32
    [768]
  • blk.10.ffn_norm.weight
    F32
    [768]
  • blk.10.ffn_up.bias
    F32
    [3072]
  • blk.10.ffn_up.weight
    F16
    [768, 3072]
  • blk.11
  • blk.11.attn_norm.bias
    F32
    [768]
  • blk.11.attn_norm.weight
    F32
    [768]
  • blk.11.attn_output.bias
    F32
    [768]
  • blk.11.attn_output.weight
    F16
    [768, 768]
  • blk.11.attn_qkv.bias
    F32
    [2304]
  • blk.11.attn_qkv.weight
    F16
    [768, 2304]
  • blk.11.ffn_down.bias
    F32
    [768]
  • blk.11.ffn_down.weight
    F16
    [3072, 768]
  • blk.11.ffn_norm.bias
    F32
    [768]
  • blk.11.ffn_norm.weight
    F32
    [768]
  • blk.11.ffn_up.bias
    F32
    [3072]
  • blk.11.ffn_up.weight
    F16
    [768, 3072]
  • output_norm.bias
    F32
    [768]
  • position_embd.weight
    F32
    [768, 1024]
  • output_norm.weight
    F32
    [768]