Spaces:

LeeHarrold
/

steer-hexter

Sleeping

App Files Files Community

LeeHarrold committed on Jul 23, 2024

Commit

f01dd44

verified ·

1 Parent(s): 7647796

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +4 -26

app.py CHANGED Viewed

@@ -43,18 +43,6 @@ class Inference:
         self.sae = sae
         self.cfg_dict = cfg_dict
-    def get_feature_info(self):
-        projection_onto_unembed = self.sae.W_dec @ self.model.W_U
-        # get the top ten words associated with the given feature
-        WORD_COUNT = 10
-        _, inds = torch.topk(projection_onto_unembed, WORD_COUNT, dim=1)
-        _, sv_feature_acts = self._get_sae_out_and_feature_activations()
-        features = self._get_features(sv_feature_acts)
-        breakpoint();
-        associated_words = [self.model.to_str_tokens(inds[f]) for f in features]
-        return associated_words
     def _get_sae_out_and_feature_activations(self):
         # given the words in steering_vector_prompt, the SAE predicts that the neurons (aka features) in activationCache will be activated
         sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
@@ -79,7 +67,9 @@ class Inference:
         # return torch.topk(sv_feature_acts, 1).indices.tolist()
         features = torch.topk(sv_feature_activations, 1).indices
         print(f'features that align with the text prompt: {features}')
-        return features[0]
     def _get_steering_hook(self, feature, sae_out):
         coeff = self.coeff
@@ -101,7 +91,7 @@ class Inference:
         # and not use the separate function _get_steering_hook()
         sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
         features = self._get_features(sv_feature_acts)
-        steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features]
         return steering_hooks
@@ -159,15 +149,6 @@ def slow_echo_steering(message, history):
         time.sleep(0.01)
         yield result[: i + 1]
-def populate_related_features():
-    features = chatbot_model.get_feature_info()
-    print(features)
-    return features[0]
-    # for feature in features:
-    #     for i in range(len(feature)):
-    #         time.sleep(0.01)
-    #         yield feature[: i + 1]
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("*STANDARD HEXTER BOT*")
@@ -197,9 +178,6 @@ with gr.Blocks() as demo:
         )
     with gr.Row():
         steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
-        found_features = gr.Textbox(label="Found Features")
-        find_features = gr.Button("Find Related Features")
-        find_features.click(fn=populate_related_features,inputs=None, outputs=found_features)
     with gr.Row():
         coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
     with gr.Row():

         self.sae = sae
         self.cfg_dict = cfg_dict
     def _get_sae_out_and_feature_activations(self):
         # given the words in steering_vector_prompt, the SAE predicts that the neurons (aka features) in activationCache will be activated
         sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
         # return torch.topk(sv_feature_acts, 1).indices.tolist()
         features = torch.topk(sv_feature_activations, 1).indices
         print(f'features that align with the text prompt: {features}')
+        print("pump the features into the tool that gives you the words associated with each feature")
+        return features
     def _get_steering_hook(self, feature, sae_out):
         coeff = self.coeff
         # and not use the separate function _get_steering_hook()
         sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
         features = self._get_features(sv_feature_acts)
+        steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features[0]]
         return steering_hooks
         time.sleep(0.01)
         yield result[: i + 1]
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("*STANDARD HEXTER BOT*")
         )
     with gr.Row():
         steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
     with gr.Row():
         coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
     with gr.Row():