LeeHarrold committed on
Commit
f01dd44
·
verified ·
1 Parent(s): 7647796

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +4 -26
app.py CHANGED
@@ -43,18 +43,6 @@ class Inference:
43
  self.sae = sae
44
  self.cfg_dict = cfg_dict
45
 
46
- def get_feature_info(self):
47
- projection_onto_unembed = self.sae.W_dec @ self.model.W_U
48
- # get the top ten words associated with the given feature
49
- WORD_COUNT = 10
50
- _, inds = torch.topk(projection_onto_unembed, WORD_COUNT, dim=1)
51
-
52
- _, sv_feature_acts = self._get_sae_out_and_feature_activations()
53
- features = self._get_features(sv_feature_acts)
54
- breakpoint();
55
- associated_words = [self.model.to_str_tokens(inds[f]) for f in features]
56
- return associated_words
57
-
58
  def _get_sae_out_and_feature_activations(self):
59
  # given the words in steering_vector_prompt, the SAE predicts that the neurons(aka features) in activateCache will be activated
60
  sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
@@ -79,7 +67,9 @@ class Inference:
79
  # return torch.topk(sv_feature_acts, 1).indices.tolist()
80
  features = torch.topk(sv_feature_activations, 1).indices
81
  print(f'features that align with the text prompt: {features}')
82
- return features[0]
 
 
83
 
84
  def _get_steering_hook(self, feature, sae_out):
85
  coeff = self.coeff
@@ -101,7 +91,7 @@ class Inference:
101
  # and not use the seperate function _get_steering_hook()
102
  sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
103
  features = self._get_features(sv_feature_acts)
104
- steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features]
105
 
106
  return steering_hooks
107
 
@@ -159,15 +149,6 @@ def slow_echo_steering(message, history):
159
  time.sleep(0.01)
160
  yield result[: i + 1]
161
 
162
- def populate_related_features():
163
- features = chatbot_model.get_feature_info()
164
- print(features)
165
- return features[0]
166
- # for feature in features:
167
- # for i in range(len(feature)):
168
- # time.sleep(0.01)
169
- # yield feature[: i + 1]
170
-
171
  with gr.Blocks() as demo:
172
  with gr.Row():
173
  gr.Markdown("*STANDARD HEXTER BOT*")
@@ -197,9 +178,6 @@ with gr.Blocks() as demo:
197
  )
198
  with gr.Row():
199
  steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
200
- found_features = gr.Textbox(label="Found Features")
201
- find_features = gr.Button("Find Related Features")
202
- find_features.click(fn=populate_related_features,inputs=None, outputs=found_features)
203
  with gr.Row():
204
  coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
205
  with gr.Row():
 
43
  self.sae = sae
44
  self.cfg_dict = cfg_dict
45
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def _get_sae_out_and_feature_activations(self):
47
  # given the words in steering_vector_prompt, the SAE predicts that the neurons(aka features) in activateCache will be activated
48
  sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
 
67
  # return torch.topk(sv_feature_acts, 1).indices.tolist()
68
  features = torch.topk(sv_feature_activations, 1).indices
69
  print(f'features that align with the text prompt: {features}')
70
+ print("pump the features into the tool that gives you the words associated with each feature")
71
+ return features
72
+
73
 
74
  def _get_steering_hook(self, feature, sae_out):
75
  coeff = self.coeff
 
91
  # and not use the seperate function _get_steering_hook()
92
  sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
93
  features = self._get_features(sv_feature_acts)
94
+ steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features[0]]
95
 
96
  return steering_hooks
97
 
 
149
  time.sleep(0.01)
150
  yield result[: i + 1]
151
 
 
 
 
 
 
 
 
 
 
152
  with gr.Blocks() as demo:
153
  with gr.Row():
154
  gr.Markdown("*STANDARD HEXTER BOT*")
 
178
  )
179
  with gr.Row():
180
  steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
 
 
 
181
  with gr.Row():
182
  coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
183
  with gr.Row():