Spaces:
Sleeping
Sleeping
LeeHarrold
committed on
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
@@ -43,18 +43,6 @@ class Inference:
|
|
43 |
self.sae = sae
|
44 |
self.cfg_dict = cfg_dict
|
45 |
|
46 |
-
def get_feature_info(self):
|
47 |
-
projection_onto_unembed = self.sae.W_dec @ self.model.W_U
|
48 |
-
# get the top ten words associated with the given feature
|
49 |
-
WORD_COUNT = 10
|
50 |
-
_, inds = torch.topk(projection_onto_unembed, WORD_COUNT, dim=1)
|
51 |
-
|
52 |
-
_, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
53 |
-
features = self._get_features(sv_feature_acts)
|
54 |
-
breakpoint();
|
55 |
-
associated_words = [self.model.to_str_tokens(inds[f]) for f in features]
|
56 |
-
return associated_words
|
57 |
-
|
58 |
def _get_sae_out_and_feature_activations(self):
|
59 |
# given the words in steering_vector_prompt, the SAE predicts that the neurons(aka features) in activateCache will be activated
|
60 |
sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
|
@@ -79,7 +67,9 @@ class Inference:
|
|
79 |
# return torch.topk(sv_feature_acts, 1).indices.tolist()
|
80 |
features = torch.topk(sv_feature_activations, 1).indices
|
81 |
print(f'features that align with the text prompt: {features}')
|
82 |
-
|
|
|
|
|
83 |
|
84 |
def _get_steering_hook(self, feature, sae_out):
|
85 |
coeff = self.coeff
|
@@ -101,7 +91,7 @@ class Inference:
|
|
101 |
# and not use the seperate function _get_steering_hook()
|
102 |
sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
103 |
features = self._get_features(sv_feature_acts)
|
104 |
-
steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features]
|
105 |
|
106 |
return steering_hooks
|
107 |
|
@@ -159,15 +149,6 @@ def slow_echo_steering(message, history):
|
|
159 |
time.sleep(0.01)
|
160 |
yield result[: i + 1]
|
161 |
|
162 |
-
def populate_related_features():
|
163 |
-
features = chatbot_model.get_feature_info()
|
164 |
-
print(features)
|
165 |
-
return features[0]
|
166 |
-
# for feature in features:
|
167 |
-
# for i in range(len(feature)):
|
168 |
-
# time.sleep(0.01)
|
169 |
-
# yield feature[: i + 1]
|
170 |
-
|
171 |
with gr.Blocks() as demo:
|
172 |
with gr.Row():
|
173 |
gr.Markdown("*STANDARD HEXTER BOT*")
|
@@ -197,9 +178,6 @@ with gr.Blocks() as demo:
|
|
197 |
)
|
198 |
with gr.Row():
|
199 |
steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
|
200 |
-
found_features = gr.Textbox(label="Found Features")
|
201 |
-
find_features = gr.Button("Find Related Features")
|
202 |
-
find_features.click(fn=populate_related_features,inputs=None, outputs=found_features)
|
203 |
with gr.Row():
|
204 |
coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
|
205 |
with gr.Row():
|
|
|
43 |
self.sae = sae
|
44 |
self.cfg_dict = cfg_dict
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
def _get_sae_out_and_feature_activations(self):
|
47 |
# given the words in steering_vector_prompt, the SAE predicts that the neurons(aka features) in activateCache will be activated
|
48 |
sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
|
|
|
67 |
# return torch.topk(sv_feature_acts, 1).indices.tolist()
|
68 |
features = torch.topk(sv_feature_activations, 1).indices
|
69 |
print(f'features that align with the text prompt: {features}')
|
70 |
+
print("pump the features into the tool that gives you the words associated with each feature")
|
71 |
+
return features
|
72 |
+
|
73 |
|
74 |
def _get_steering_hook(self, feature, sae_out):
|
75 |
coeff = self.coeff
|
|
|
91 |
# and not use the seperate function _get_steering_hook()
|
92 |
sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
93 |
features = self._get_features(sv_feature_acts)
|
94 |
+
steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features[0]]
|
95 |
|
96 |
return steering_hooks
|
97 |
|
|
|
149 |
time.sleep(0.01)
|
150 |
yield result[: i + 1]
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
with gr.Blocks() as demo:
|
153 |
with gr.Row():
|
154 |
gr.Markdown("*STANDARD HEXTER BOT*")
|
|
|
178 |
)
|
179 |
with gr.Row():
|
180 |
steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
|
|
|
|
|
|
|
181 |
with gr.Row():
|
182 |
coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
|
183 |
with gr.Row():
|