charaf/gemma4-31b-claude-opus-abliterated/json

charaf/gemma4-31b-claude-opus-abliterated:latest

31 Downloads · Updated 5 hours ago

gemma4-31b-claude-opus-abliterated:latest ... /

json

8c0a85f62133 · 2.0kB

{

"source_model": "TeichAI/gemma-4-31B-it-Claude-Opus-Distill",

"technique": "refusal_direction_ablation",

"method": "advanced",

"method_config": {

"n_directions": 4,

"direction_method": "svd",

"norm_preserve": true,

"regularization": 0.3,

"refinement_passes": 2,

"project_biases": true,

"use_chat_template": true,

"use_whitened_svd": false,

"true_iterative_refinement": false,

"winsorize_activations": false,

"float_layer_interpolation": false,

"cot_aware": false,

"use_kl_optimization": false,

"use_lora_ablation": false,

"spectral_cascade": false,

"spectral_bands": 3,

"spectral_threshold": 0.05

"references": [

"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)",

"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",

"Norm-Preserving Biprojected Abliteration (grimjim, 2025)",

"Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)",

"Joad et al., More to Refusal than a Single Direction (2026)",

"Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization",

"OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)"

"strong_layers": [

40,

39,

42,

41,

43,

52,

44,

45,

46,

51,

53,

50,

48,

49,

47,

38,

55,

54,

56,

57,

34,

58,

36,

35,

33,

37,

32,

31,

30,

27,

26,

25,

29,

24,

28,

23,

22

],

"n_harmful_prompts": 512,

"n_harmless_prompts": 512,

"quality_metrics": {

"perplexity": 6432.466652702751,

"coherence": 0.2,

"refusal_rate": 0.0,

"kl_divergence": 6.550369739532471,

"spectral_certification": "RED"

"kl_contributions": {},

"cot_preserved_layers": [],

"float_layer_weights": {},

"lora_adapters_saved": false

}