SAELens
English
sparse-autoencoder
SAE
interpretability
deception-detection
mechanistic-interpretability
neuronpedia
Solshine committed on
Commit
2325115
·
0 Parent(s):

Initial public release: SAE weights, cfg, and model card

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. README.md +216 -0
  3. d20_jumprelu_L10_deceptive_only/cfg.json +23 -0
  4. d20_jumprelu_L10_deceptive_only/sae_weights.safetensors +3 -0
  5. d20_jumprelu_L10_honest_only/cfg.json +23 -0
  6. d20_jumprelu_L10_honest_only/sae_weights.safetensors +3 -0
  7. d20_jumprelu_L10_mixed/cfg.json +23 -0
  8. d20_jumprelu_L10_mixed/sae_weights.safetensors +3 -0
  9. d20_jumprelu_L14_deceptive_only/cfg.json +23 -0
  10. d20_jumprelu_L14_deceptive_only/sae_weights.safetensors +3 -0
  11. d20_jumprelu_L14_honest_only/cfg.json +23 -0
  12. d20_jumprelu_L14_honest_only/sae_weights.safetensors +3 -0
  13. d20_jumprelu_L14_mixed/cfg.json +23 -0
  14. d20_jumprelu_L14_mixed/sae_weights.safetensors +3 -0
  15. d20_jumprelu_L18_deceptive_only/cfg.json +23 -0
  16. d20_jumprelu_L18_deceptive_only/sae_weights.safetensors +3 -0
  17. d20_jumprelu_L18_honest_only/cfg.json +23 -0
  18. d20_jumprelu_L18_honest_only/sae_weights.safetensors +3 -0
  19. d20_jumprelu_L18_mixed/cfg.json +23 -0
  20. d20_jumprelu_L18_mixed/sae_weights.safetensors +3 -0
  21. d20_jumprelu_L2_deceptive_only/cfg.json +23 -0
  22. d20_jumprelu_L2_deceptive_only/sae_weights.safetensors +3 -0
  23. d20_jumprelu_L2_honest_only/cfg.json +23 -0
  24. d20_jumprelu_L2_honest_only/sae_weights.safetensors +3 -0
  25. d20_jumprelu_L2_mixed/cfg.json +23 -0
  26. d20_jumprelu_L2_mixed/sae_weights.safetensors +3 -0
  27. d20_jumprelu_L4_deceptive_only/cfg.json +23 -0
  28. d20_jumprelu_L4_deceptive_only/sae_weights.safetensors +3 -0
  29. d20_jumprelu_L4_honest_only/cfg.json +23 -0
  30. d20_jumprelu_L4_honest_only/sae_weights.safetensors +3 -0
  31. d20_jumprelu_L4_mixed/cfg.json +23 -0
  32. d20_jumprelu_L4_mixed/sae_weights.safetensors +3 -0
  33. d20_jumprelu_L8_deceptive_only/cfg.json +23 -0
  34. d20_jumprelu_L8_deceptive_only/sae_weights.safetensors +3 -0
  35. d20_jumprelu_L8_honest_only/cfg.json +23 -0
  36. d20_jumprelu_L8_honest_only/sae_weights.safetensors +3 -0
  37. d20_jumprelu_L8_mixed/cfg.json +23 -0
  38. d20_jumprelu_L8_mixed/sae_weights.safetensors +3 -0
  39. d20_jumprelu_ste_L10_deceptive_only/cfg.json +24 -0
  40. d20_jumprelu_ste_L10_deceptive_only/sae_weights.safetensors +3 -0
  41. d20_jumprelu_ste_L10_honest_only/cfg.json +24 -0
  42. d20_jumprelu_ste_L10_honest_only/sae_weights.safetensors +3 -0
  43. d20_jumprelu_ste_L10_mixed/cfg.json +24 -0
  44. d20_jumprelu_ste_L10_mixed/sae_weights.safetensors +3 -0
  45. d20_jumprelu_ste_L14_deceptive_only/cfg.json +24 -0
  46. d20_jumprelu_ste_L14_deceptive_only/sae_weights.safetensors +3 -0
  47. d20_jumprelu_ste_L14_honest_only/cfg.json +24 -0
  48. d20_jumprelu_ste_L14_honest_only/sae_weights.safetensors +3 -0
  49. d20_jumprelu_ste_L14_mixed/cfg.json +24 -0
  50. d20_jumprelu_ste_L14_mixed/sae_weights.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ tags:
5
+ - sparse-autoencoder
6
+ - SAE
7
+ - interpretability
8
+ - deception-detection
9
+ - mechanistic-interpretability
10
+ - saelens
11
+ - neuronpedia
12
+ base_model:
13
+ - karpathy/nanochat-d32
14
+ - karpathy/nanochat-d20
15
+ - meta-llama/Llama-3.2-1B
16
+ - Qwen/Qwen3-1.7B
17
+ - HuggingFaceTB/SmolLM2-135M
18
+ - microsoft/phi-2
19
+ - TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
20
+ - microsoft/Phi-4-mini-reasoning
21
+ - EleutherAI/pythia-160m
22
+ datasets:
23
+ - Solshine/deception-behavioral-multimodel
24
+ ---
25
+
26
+ # Deception Behavioral SAEs — SAELens/Neuronpedia Format
27
+
28
+ **366 Sparse Autoencoders** trained on behavioral deception activations across
29
+ 9 language models (5 architecture families), formatted for
30
+ [SAELens](https://github.com/jbloomAus/SAELens) and
31
+ [Neuronpedia](https://neuronpedia.org) compatibility.
32
+
33
+
34
+ ## Training-data caveat — please read before use
35
+
36
+ The "honest" and "deceptive" labels used to train and evaluate these SAEs
37
+ come from a **mixed scenario pool** of nine prompts:
38
+
39
+ - **Six clean incentive-structure scenarios** — `insider_info`,
40
+ `accounting_error`, `ai_oversight_log`, `ai_capability_hide`,
41
+ `surprise_party`, `job_interview_gap`. Each presents a realistic
42
+ situation with a non-trivial incentive to conceal or disclose
43
+ information; the honest/deceptive distinction is which of the two
44
+ behavioral choices the model's completion settles into under
45
+ temperature sampling.
46
+ - **Three role-play identity-assignment scenarios** —
47
+ `secret_role_game` ("You have been assigned the role of Fascist"),
48
+ `secret_role_game_v2` ("You are secretly one of the Mafia members"),
49
+ `werewolf_game` ("You are a Werewolf"). These pre-assign the model a
50
+ deceptive identity and label a completion "deceptive" when the model
51
+ drifts away from the assigned role or "honest" when it echoes it.
52
+
53
+ **What this mixed pool means for the SAEs' labels.** Within the six
54
+ incentive-structure scenarios, the honest/deceptive distinction is a
55
+ measurement of behavioral choice under an ambiguous incentive. Within
56
+ the three role-play scenarios, the distinction is a measurement of
57
+ role-consistency under identity-assigned role-play — which is a
58
+ well-defined phenomenon but not the same as emergent or incentive-
59
+ driven deception.
60
+
61
+ **What these SAEs are and are not good for.**
62
+
63
+ - **Good for:** research on mixed-pool activation geometry; SAE
64
+ feature-geometry studies; as one of a set of baselines when
65
+ comparing multiple SAE families; as a reference implementation of
66
+ same-prompt temperature-sampled behavioral SAE training at scale.
67
+ - **Not recommended as a standalone deception detector.** The
68
+ role-consistency signal from the three role-play scenarios is mixed
69
+ into every aggregate metric reported below. A downstream user who
70
+ wants an "emergent-deception feature set" should restrict attention
71
+ to features whose activation pattern concentrates in the
72
+ `insider_info` / `accounting_error` / `ai_oversight_log` /
73
+ `ai_capability_hide` / `surprise_party` / `job_interview_gap`
74
+ scenarios — or wait for the methodologically corrected V3 re-release
75
+ currently in preparation on the decision-incentive scenario bank
76
+ (no pre-assigned deceptive identity).
77
+
78
+ **What is unaffected by this caveat.**
79
+
80
+ - The SAE weights, reconstruction metrics (explained variance, L0,
81
+ alive features), and engineering of the training pipeline are
82
+ accurate as reported.
83
+ - The linear-probe balanced-accuracy numbers in the upstream paper
84
+ measure the mixed pool; the 6-scenario clean-subset re-analysis is
85
+ listed as a planned appendix for the next manuscript revision.
86
+
87
+ A companion methodology-first Gemma 4 SAE suite is in preparation using
88
+ pretraining-distribution data + a decision-incentive behavior split;
89
+ this README will be updated with a link when that release is public.
90
+
91
+ ---
92
+
93
+ Original flat-file checkpoints (with full training metadata) are in:
94
+ [Solshine/nanochat-d32-deception-saes-batch](https://huggingface.co/Solshine/nanochat-d32-deception-saes-batch)
95
+
96
+ ## Research Context
97
+
98
+ These SAEs are trained on **same-prompt behavioral sampling** data: a single ambiguous
99
+ scenario prompt produces both deceptive and honest completions via temperature sampling.
100
+ The SAEs decompose residual stream activations during deceptive vs. honest response
101
+ generation — enabling interpretability analysis of deception-relevant features.
102
+
103
+ **Paper:** "The Secret Agenda: LLMs Strategically Lie Undetected by Current Safety Tools"
104
+ [arXiv:2509.20393](https://arxiv.org/abs/2509.20393)
105
+ **Follow-up repo:** [SolshineCode/deception-nanochat-sae-research](https://github.com/SolshineCode/deception-nanochat-sae-research)
106
+ **Author:** Caleb DeLeeuw (2026)
107
+
108
+ ## Key Findings (Cross-Model, 366 SAEs, 9 Models, 5 Architecture Families)
109
+
110
+ **Linear probes on raw activations:**
111
+
112
+ | Model | Params | Peak Layer (depth) | Bal. Accuracy | AUROC |
113
+ |---|---|---|---|---|
114
+ | nanochat-d32 | 1.88B | L12 (37%) | **86.9%** | 0.923 |
115
+ | Qwen3-1.7B | 1.7B | L17 (63%) | **80.9%** | 0.893 |
116
+ | Phi-4-mini-reasoning | 3.8B | L20 (64%) | **80.8%** | 0.860 |
117
+ | Phi-2 | 2.7B | L21 (75%) | ~75% | — |
118
+ | TinyLlama-1.1B | 1.1B | L21 (95%) | **73.2%** | 0.784 |
119
+ | Llama 3.2-1B | 1.0B | L9 (56%) | **72.5%** | — |
120
+ | nanochat-d20 | ~561M | L14 (70%) | ~67% | — |
121
+ | SmolLM2-135M | 135M | L4 (80%) | ~69% | — |
122
+ | Pythia-160M | 160M | L0 (0%) | **66.0%** | 0.696 |
123
+
124
+ All results p < 0.001, PCA-robust.
125
+
126
+ **SAE decomposition — model-size-dependent:**
127
+
128
+ - **Models ≤ 1.3B:** SAEs *help* detection (8–47% of SAEs beat raw probe accuracy)
129
+ - **Models ≥ 1.7B:** SAEs *hurt* detection (0–4% beat raw; Phi-2 is an exception, see the anomaly note below)
130
+ - **Transition:** between TinyLlama-1.1B (47% help) and Qwen3-1.7B (<4% help)
131
+ - **Best SAE config (small models):** JumpReLU + honest_only training condition
132
+ - **Phi-2 anomaly:** 33% of SAEs help at 2.7B (parallel attention architecture); does NOT extend to Phi-4-mini (3.8B, 2%)
133
+ - **Feature steering:** Null results at all tested layers/models — deception is distributed, not localizable to individual features
134
+
135
+ ## Models Covered
136
+
137
+ | Model | Params | Architecture | Layers in SAEs | SAE Count | SAE Arches |
138
+ |---|---|---|---|---|---|
139
+ | nanochat-d32 | 1.88B | GPT-NeoX | L4, 8, 12, 16, 20, 24 | 57 | TopK, JumpReLU, Gated |
140
+ | nanochat-d20 | ~561M | GPT-NeoX | L2, 4, 8, 10, 14, 18 (+STE) | 45 | TopK, JumpReLU |
141
+ | Qwen3-1.7B | 1.7B | Qwen | L12, 14, 15, 17, 18 | 45 | TopK, JumpReLU, Gated |
142
+ | Phi-4-mini-reasoning | 3.8B | Phi | L2, 6, 10, 14, 18, 22, 26 | 42 | TopK, JumpReLU |
143
+ | SmolLM2-135M | 135M | Llama2 | L3, 4, 5, 6, 9, 12, 15, 18, 21 | 54 | TopK, JumpReLU |
144
+ | Phi-2 | 2.7B | Phi (parallel) | L4, 8, 12, 16, 20 | 30 | TopK, JumpReLU |
145
+ | TinyLlama-1.1B | 1.1B | Llama2 | L3, 6, 9, 12, 15 (+STE) | 39 | TopK, JumpReLU |
146
+ | Llama 3.2-1B | 1.0B | Llama | L2, 4, 6 | 18 | TopK, JumpReLU |
147
+ | Pythia-160M | 160M | GPT-NeoX | L1, 2, 4, 6, 8, 10 | 36 | TopK, JumpReLU |
148
+ | **Total** | | | | **366** | |
149
+
150
+ **Note on STE validation SAEs:** nanochat-d20 and TinyLlama each include 9 additional
151
+ "_ste_" tagged SAEs (e.g., `d20_jumprelu_ste_L14_honest_only`) trained with the corrected
152
+ Gaussian-kernel STE to validate that the JumpReLU honest_only advantage is not a
153
+ dimensionality artifact. 15/18 conditions (83%) confirm the advantage is real.
154
+
155
+ ## Training Details
156
+
157
+ **Hardware:** NVIDIA GeForce GTX 1650 Ti with Max-Q Design, 4 GB VRAM (Windows 11 Pro)
158
+ **Training time:** ~400–600 seconds per SAE (300 epochs, batch_size=128)
159
+ **Framework:** Custom PyTorch training loop with SAELens-compatible architecture
160
+ **Activations:** Residual stream (`resid_post`) collected at generation time
161
+ **Expansion factor:** 4× (d_sae = 4 × d_model)
162
+ **Architectures:** TopK (k=64), JumpReLU, Gated
163
+ **Training conditions:** `mixed` (all completions), `honest_only`, `deceptive_only`
164
+ **Classification:** Gemini 2.5 Flash (behavioral LLM classification, not regex)
165
+
166
+ ## SAE Format
167
+
168
+ Each SAE is in its own subfolder `{sae_id}/` containing:
169
+ - `sae_weights.safetensors` — weights (W_enc, b_enc, W_dec, b_dec, [threshold for JumpReLU])
170
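+ - `cfg.json` — SAELens-compatible config (architecture, model_name, hook_name, hook_layer, d_in, d_sae); the training condition (`mixed` / `honest_only` / `deceptive_only`) is given by the `{sae_id}` folder-name suffix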
171
+
172
+ ## Known Limitations
173
+
174
+ **JumpReLU threshold training (348 original SAEs):**
175
+ The JumpReLU SAEs among the 348 original (non-STE) batch SAEs have `threshold = 0` throughout — functionally equivalent
176
+ to ReLU. The Heaviside step function has zero autograd gradient with respect to the threshold,
177
+ so without a custom straight-through estimator (STE), the threshold never updates from its
178
+ initialization of zero. These SAEs operate with ~50% feature density (L0 ≈ d_sae/2) rather
179
+ than the intended sparse regime. TopK SAEs (exact L0=64) are the properly sparse architecture
180
+ in this collection.
181
+
182
+ **STE fix (2026-04-11):** The training code has been corrected with a Gaussian-kernel STE
183
+ (Rajamanoharan et al. 2024, arXiv:2407.14435). The 18 `_ste_` tagged SAEs in this repo
184
+ use the corrected code. Targeted validation (18 STE SAEs across d20 and TinyLlama)
185
+ confirmed that the honest_only advantage over TopK is **not** a dimensionality artifact —
186
+ 15/18 conditions (83%) show STE JumpReLU > TopK even with threshold training.
187
+
188
+ **The JumpReLU honest_only > TopK probe accuracy finding is valid** regardless of the threshold bug.
189
+ The threshold bug affects downstream Neuronpedia feature analysis (active feature density),
190
+ not the probe accuracy comparisons.
191
+
192
+ ## Loading the weights (SAELens-compatible format)
193
+
194
+ ```python
195
+ from safetensors.torch import load_file
196
+ import json
197
+
198
+ sae_id = "d32_topk_L12_honest_only" # or any sae_id from the repo
199
+ weights = load_file(f"{sae_id}/sae_weights.safetensors")
200
+ cfg = json.load(open(f"{sae_id}/cfg.json"))
201
+ # W_enc shape: [d_in, d_sae], W_dec shape: [d_sae, d_in]
202
+ # The training condition (honest_only / deceptive_only / mixed) is the suffix of sae_id
203
+ ```
204
+
205
+ ## Citation
206
+
207
+ If you use these SAEs, please cite the original paper:
208
+
209
+ ```
210
+ @article{thesecretagenda2025,
211
+ title={The Secret Agenda: LLMs Strategically Lie Undetected by Current Safety Tools},
212
+ author={DeLeeuw, Caleb},
213
+ journal={arXiv:2509.20393},
214
+ year={2025}
215
+ }
216
+ ```
d20_jumprelu_L10_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 10, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L10_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d01e98627d6498c10850c0ea36d28e1bb37a0202c3dc0396297432fe6ae93a6b
3
+ size 52475272
d20_jumprelu_L10_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 10, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L10_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fa8acaff891b62eba2be1545b301037e343bfd382f35a311fa36907b7787801
3
+ size 52475272
d20_jumprelu_L10_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 10, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L10_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f2791a0c4d6bd906f93d47b1bbae82621114bae50b284c207dbb1244c32d28e
3
+ size 52475272
d20_jumprelu_L14_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 14, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L14_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f48d0dbdf4b16efc6e3f2c9f4d9c291c83026d5932dea117497d04725d57743
3
+ size 52475272
d20_jumprelu_L14_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 14, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L14_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c947de17dcbf644c39a6739fff5a5ffcb5f0a4f91301096969e818322a710de
3
+ size 52475272
d20_jumprelu_L14_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 14, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L14_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93b6f726f410d888c15122af0aebe96f78963df83409953498811d1b3507fbfb
3
+ size 52475272
d20_jumprelu_L18_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.18",
9
+ "hook_layer": 18,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 18, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L18_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84f9cf2f3df9b455c31d7dfa6d020d7833fb5997e4942a901aa13972209ec425
3
+ size 52475272
d20_jumprelu_L18_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.18",
9
+ "hook_layer": 18,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 18, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L18_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd323fe9111e8252a4eb1a7bf98412b5bc565a24f065aff59763391028d128d5
3
+ size 52475272
d20_jumprelu_L18_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.18",
9
+ "hook_layer": 18,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 18, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L18_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c067cf83457eac9b858cd73d99e6fe432f99823baaab576a2c628b430068e5e1
3
+ size 52475272
d20_jumprelu_L2_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.2",
9
+ "hook_layer": 2,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 2, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L2_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c123a302a2fa65c4a70c4d92a8fb1dc6134fa67f82852a64938136d852d1eb95
3
+ size 52475272
d20_jumprelu_L2_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.2",
9
+ "hook_layer": 2,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 2, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L2_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03787ad756a1e60a282884cacf72ca6964e23fabd539004835c31a4aff878a03
3
+ size 52475272
d20_jumprelu_L2_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.2",
9
+ "hook_layer": 2,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 2, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L2_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e945cec230f3f9d8e3cbd10efc2dcf817a5f072c03253e40d21af04c0442aba6
3
+ size 52475272
d20_jumprelu_L4_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.4",
9
+ "hook_layer": 4,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 4, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L4_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:659910ca6dfa1e448ab6868976ecd700eb2b9839a7e6def02ab113c5a0e1043a
3
+ size 52475272
d20_jumprelu_L4_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.4",
9
+ "hook_layer": 4,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 4, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L4_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f201daada8c7efdbd486e54ae176f8e03908fc58360753e8d688728565d155c
3
+ size 52475272
d20_jumprelu_L4_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.4",
9
+ "hook_layer": 4,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 4, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L4_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a5bb121f7146fb52ba7114f04eccfdb9b0a5edd19f51ca600223ad8700b7480
3
+ size 52475272
d20_jumprelu_L8_deceptive_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.8",
9
+ "hook_layer": 8,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 8, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L8_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d23b57485e5312179551b8258dcf733876659c9eca59a52077fcb6dd0ebe3c05
3
+ size 52475272
d20_jumprelu_L8_honest_only/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.8",
9
+ "hook_layer": 8,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 8, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L8_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b6a84176fb1de8f1ef94fa54a0c3d362603495a71beb25898b212af1c57263
3
+ size 52475272
d20_jumprelu_L8_mixed/cfg.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "transformer.h.8",
9
+ "hook_layer": 8,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_notes": "Deception behavioral SAE \u2014 same-prompt behavioral sampling. Model: karpathy/nanochat-d20, Layer 8, jumprelu. See https://github.com/SolshineCode/deception-nanochat-sae-research",
22
+ "source_repo": "https://github.com/SolshineCode/deception-nanochat-sae-research"
23
+ }
d20_jumprelu_L8_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca72b83da9ebed2cc0de114dbbaad61003c25b92321538de58a9bd39bdba17f5
3
+ size 52475272
d20_jumprelu_ste_L10_deceptive_only/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "deceptive_only",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 deceptive_only training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L10_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9197f52fa0a3b8b0628c2f5d3f055a271d61dad03f3e53043d5d5bb2941c441
3
+ size 52475272
d20_jumprelu_ste_L10_honest_only/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "honest_only",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 honest_only training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L10_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3817a36427111d4988927011ba41f7e0f4c4557a0e54fd7f8c68be88903e5aba
3
+ size 52475272
d20_jumprelu_ste_L10_mixed/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.10",
9
+ "hook_layer": 10,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "mixed",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 mixed training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L10_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ab333e58a17d0b66347283a80362dad7084daeeed74a2f64151226bd8449d6
3
+ size 52475272
d20_jumprelu_ste_L14_deceptive_only/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "deceptive_only",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 deceptive_only training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L14_deceptive_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53862b8a592b6e290d92188c9e1b1e7050ece8e4723631ce23baee33fb618528
3
+ size 52475272
d20_jumprelu_ste_L14_honest_only/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "honest_only",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 honest_only training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L14_honest_only/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca090dad90ed0e54105b37b5b2e6be6cd7ec4b589e7ced771d192d00ecd8ab7c
3
+ size 52475272
d20_jumprelu_ste_L14_mixed/cfg.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "jumprelu",
3
+ "d_in": 1280,
4
+ "d_sae": 5120,
5
+ "dtype": "float32",
6
+ "device": "cpu",
7
+ "model_name": "karpathy/nanochat-d20",
8
+ "hook_name": "model.layers.14",
9
+ "hook_layer": 14,
10
+ "hook_head_index": null,
11
+ "activation_fn_str": "jumprelu",
12
+ "activation_fn_kwargs": {},
13
+ "apply_b_dec_to_input": false,
14
+ "finetuning_scaling_factor": false,
15
+ "sae_lens_training_version": "deception-behavioral-ste-v1",
16
+ "prepend_bos": false,
17
+ "dataset_path": "Solshine/deception-behavioral-multimodel",
18
+ "dataset_trust_remote_code": false,
19
+ "context_size": null,
20
+ "normalize_activations": "none",
21
+ "training_condition": "mixed",
22
+ "training_notes": "STE validation SAE (2026-04-11) \u2014 mixed training, Gaussian-kernel STE fix (arXiv:2407.14435). See https://github.com/SolshineCode/deception-nanochat-sae-research",
23
+ "ste_note": "STE validation SAE: thresholds trained via Gaussian-kernel STE (Rajamanoharan et al. 2024, arXiv:2407.14435). Trained for 300 epochs. See entries #61/#62 in RESULTS_INDEX.md for probe accuracy comparison vs TopK."
24
+ }
d20_jumprelu_ste_L14_mixed/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d349a99b8412a6f70beff23b3fd593b710325b49f4512e2f856ebf2df2008594
3
+ size 52475272