geekyrakshit committed on
Commit
b207b4c
·
1 Parent(s): 096a26c

add: docs for prompt injection guardrails

Browse files
docs/guardrails/base.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Guardrail Base Class
2
+
3
+ ::: guardrails_genie.guardrails.base
docs/guardrails/manager.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Guardrail Manager
2
+
3
+ ::: guardrails_genie.guardrails.manager
docs/guardrails/prompt_injection/classifier.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Prompt Injection Classifier Guardrail
2
+
3
+ ::: guardrails_genie.guardrails.injection.classifier_guardrail
docs/guardrails/prompt_injection/llm_survey.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Survey Guardrail
2
+
3
+ ::: guardrails_genie.guardrails.injection.survey_guardrail
guardrails_genie/guardrails/base.py CHANGED
@@ -4,6 +4,25 @@ import weave
4
 
5
 
6
  class Guardrail(weave.Model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def __init__(self, *args, **kwargs):
8
  super().__init__(*args, **kwargs)
9
 
 
4
 
5
 
6
  class Guardrail(weave.Model):
7
+ """
8
+ The Guardrail class is an abstract base class that extends the weave.Model.
9
+
10
+ This class is designed to provide a framework for implementing guardrails
11
+ in the form of the `guard` method. The `guard` method is an abstract method
12
+ that must be implemented by any subclass. It takes a prompt string and
13
+ additional keyword arguments, and returns a list of strings. The specific
14
+ implementation of the `guard` method will define the behavior of the guardrail.
15
+
16
+ Attributes:
17
+ None
18
+
19
+ Methods:
20
+ guard(prompt: str, **kwargs) -> list[str]:
21
+ Abstract method that must be implemented by subclasses. It takes a
22
+ prompt string and additional keyword arguments, and returns a list
23
+ of strings.
24
+ """
25
+
26
  def __init__(self, *args, **kwargs):
27
  super().__init__(*args, **kwargs)
28
 
guardrails_genie/guardrails/injection/classifier_guardrail.py CHANGED
@@ -11,6 +11,15 @@ from ..base import Guardrail
11
 
12
 
13
  class PromptInjectionClassifierGuardrail(Guardrail):
 
 
 
 
 
 
 
 
 
14
  model_name: str = "ProtectAI/deberta-v3-base-prompt-injection-v2"
15
  _classifier: Optional[Pipeline] = None
16
 
@@ -39,6 +48,24 @@ class PromptInjectionClassifierGuardrail(Guardrail):
39
 
40
  @weave.op()
41
  def guard(self, prompt: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  response = self.classify(prompt)
43
  confidence_percentage = round(response[0]["score"] * 100, 2)
44
  return {
 
11
 
12
 
13
  class PromptInjectionClassifierGuardrail(Guardrail):
14
+ """
15
+ A guardrail that uses a pre-trained text-classification model to classify prompts
16
+ for potential injection attacks.
17
+
18
+ Args:
19
+ model_name (str): The name of the HuggingFace model or a WandB
20
+ checkpoint artifact path to use for classification.
21
+ """
22
+
23
  model_name: str = "ProtectAI/deberta-v3-base-prompt-injection-v2"
24
  _classifier: Optional[Pipeline] = None
25
 
 
48
 
49
  @weave.op()
50
  def guard(self, prompt: str):
51
+ """
52
+ Analyzes the given prompt to determine if it is safe or potentially an injection attack.
53
+
54
+ This function uses a pre-trained text-classification model to classify the prompt.
55
+ It calls the `classify` method to get the classification result, which includes a label
56
+ and a confidence score. The function then calculates the confidence percentage and
57
+ returns a dictionary with two keys:
58
+
59
+ - "safe": A boolean indicating whether the prompt is safe (True) or an injection (False).
60
+ - "summary": A string summarizing the classification result, including the label and the
61
+ confidence percentage.
62
+
63
+ Args:
64
+ prompt (str): The input prompt to be classified.
65
+
66
+ Returns:
67
+ dict: A dictionary containing the safety status and a summary of the classification result.
68
+ """
69
  response = self.classify(prompt)
70
  confidence_percentage = round(response[0]["score"] * 100, 2)
71
  return {
guardrails_genie/guardrails/injection/survey_guardrail.py CHANGED
@@ -16,10 +16,32 @@ class SurveyGuardrailResponse(BaseModel):
16
 
17
 
18
  class PromptInjectionSurveyGuardrail(Guardrail):
 
 
 
 
 
 
 
 
 
19
  llm_model: OpenAIModel
20
 
21
  @weave.op()
22
  def load_prompt_injection_survey(self) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  prompt_injection_survey_path = os.path.join(
24
  os.getcwd(), "prompts", "injection_paper_1.md"
25
  )
@@ -30,6 +52,30 @@ class PromptInjectionSurveyGuardrail(Guardrail):
30
 
31
  @weave.op()
32
  def format_prompts(self, prompt: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  markdown_text = self.load_prompt_injection_survey()
34
  user_prompt = f"""You are given the following research papers as reference:\n\n{markdown_text}"""
35
  user_prompt += f"""
@@ -62,6 +108,21 @@ Here are some strict instructions that you must follow:
62
 
63
  @weave.op()
64
  def predict(self, prompt: str, **kwargs) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  user_prompt, system_prompt = self.format_prompts(prompt)
66
  chat_completion = self.llm_model.predict(
67
  user_prompts=user_prompt,
@@ -74,6 +135,22 @@ Here are some strict instructions that you must follow:
74
 
75
  @weave.op()
76
  def guard(self, prompt: str, **kwargs) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  response = self.predict(prompt, **kwargs)
78
  summary = (
79
  f"Prompt is deemed safe. {response.explanation}"
 
16
 
17
 
18
  class PromptInjectionSurveyGuardrail(Guardrail):
19
+ """
20
+ A guardrail that uses a summarized version of the research paper
21
+ [An Early Categorization of Prompt Injection Attacks on Large Language Models](https://arxiv.org/abs/2402.00898)
22
+ to assess whether a prompt is a prompt injection attack or not.
23
+
24
+ Args:
25
+ llm_model (OpenAIModel): The LLM model to use for the guardrail.
26
+ """
27
+
28
  llm_model: OpenAIModel
29
 
30
  @weave.op()
31
  def load_prompt_injection_survey(self) -> str:
32
+ """
33
+ Loads the prompt injection survey content from a markdown file, wraps it in
34
+ `<research_paper>...</research_paper>` tags, and returns it as a string.
35
+
36
+ This function constructs the file path to the markdown file containing the
37
+ summarized research paper on prompt injection attacks. It reads the content
38
+ of the file, wraps it in <research_paper> tags, and returns the formatted
39
+ string. This formatted content is used as a reference in the prompt
40
+ assessment process.
41
+
42
+ Returns:
43
+ str: The content of the prompt injection survey wrapped in <research_paper> tags.
44
+ """
45
  prompt_injection_survey_path = os.path.join(
46
  os.getcwd(), "prompts", "injection_paper_1.md"
47
  )
 
52
 
53
  @weave.op()
54
  def format_prompts(self, prompt: str) -> str:
55
+ """
56
+ Formats the user and system prompts for assessing potential prompt injection attacks.
57
+
58
+ This function constructs two types of prompts: a user prompt and a system prompt.
59
+ The user prompt includes the content of a research paper on prompt injection attacks,
60
+ which is loaded using the `load_prompt_injection_survey` method. This content is
61
+ wrapped in a specific format to serve as a reference for the assessment process.
62
+ The user prompt also includes the input prompt that needs to be evaluated for
63
+ potential injection attacks, enclosed within <input_prompt> tags.
64
+
65
+ The system prompt provides detailed instructions to an expert system on how to
66
+ analyze the input prompt. It specifies that the system should use the research
67
+ papers as a reference to determine if the input prompt is a prompt injection attack,
68
+ and if so, classify it as a direct or indirect attack and identify the specific type.
69
+ The system is instructed to provide a detailed explanation of its assessment,
70
+ citing specific parts of the research papers, and to follow strict guidelines
71
+ to ensure accuracy and clarity.
72
+
73
+ Args:
74
+ prompt (str): The input prompt to be assessed for potential injection attacks.
75
+
76
+ Returns:
77
+ tuple: A tuple containing the formatted user prompt and system prompt.
78
+ """
79
  markdown_text = self.load_prompt_injection_survey()
80
  user_prompt = f"""You are given the following research papers as reference:\n\n{markdown_text}"""
81
  user_prompt += f"""
 
108
 
109
  @weave.op()
110
  def predict(self, prompt: str, **kwargs) -> list[str]:
111
+ """
112
+ Predicts whether the given input prompt is a prompt injection attack.
113
+
114
+ This function formats the user and system prompts using the `format_prompts` method,
115
+ which includes the content of research papers and the input prompt to be assessed.
116
+ It then uses the `llm_model` to predict the nature of the input prompt by providing
117
+ the formatted prompts and expecting a response in the `SurveyGuardrailResponse` format.
118
+
119
+ Args:
120
+ prompt (str): The input prompt to be assessed for potential injection attacks.
121
+ **kwargs: Additional keyword arguments to be passed to the `llm_model.predict` method.
122
+
123
+ Returns:
124
+ list[str]: The parsed response from the model, indicating the assessment of the input prompt.
125
+ """
126
  user_prompt, system_prompt = self.format_prompts(prompt)
127
  chat_completion = self.llm_model.predict(
128
  user_prompts=user_prompt,
 
135
 
136
  @weave.op()
137
  def guard(self, prompt: str, **kwargs) -> list[str]:
138
+ """
139
+ Assesses the given input prompt for potential prompt injection attacks and provides a summary.
140
+
141
+ This function uses the `predict` method to determine whether the input prompt is a prompt injection attack.
142
+ It then constructs a summary based on the prediction, indicating whether the prompt is safe or an attack.
143
+ If the prompt is deemed an attack, the summary specifies whether it is a direct or indirect attack and the type of attack.
144
+
145
+ Args:
146
+ prompt (str): The input prompt to be assessed for potential injection attacks.
147
+ **kwargs: Additional keyword arguments to be passed to the `predict` method.
148
+
149
+ Returns:
150
+ dict: A dictionary containing:
151
+ - "safe" (bool): Indicates whether the prompt is safe (True) or an injection attack (False).
152
+ - "summary" (str): A summary of the assessment, including the type of attack and explanation if applicable.
153
+ """
154
  response = self.predict(prompt, **kwargs)
155
  summary = (
156
  f"Prompt is deemed safe. {response.explanation}"
guardrails_genie/guardrails/manager.py CHANGED
@@ -6,10 +6,44 @@ from .base import Guardrail
6
 
7
 
8
  class GuardrailManager(weave.Model):
 
 
 
 
 
 
 
 
 
 
9
  guardrails: list[Guardrail]
10
 
11
  @weave.op()
12
  def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  alerts, summaries, safe = [], "", True
14
  iterable = (
15
  track(self.guardrails, description="Running guardrails")
@@ -31,4 +65,25 @@ class GuardrailManager(weave.Model):
31
 
32
  @weave.op()
33
  def predict(self, prompt: str, **kwargs) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  return self.guard(prompt, progress_bar=False, **kwargs)
 
6
 
7
 
8
  class GuardrailManager(weave.Model):
9
+ """
10
+ GuardrailManager is responsible for managing and executing a series of guardrails
11
+ on a given prompt. It utilizes the `weave` framework to define operations that
12
+ can be applied to the guardrails.
13
+
14
+ Attributes:
15
+ guardrails (list[Guardrail]): A list of Guardrail objects that define the
16
+ rules and checks to be applied to the input prompt.
17
+ """
18
+
19
  guardrails: list[Guardrail]
20
 
21
  @weave.op()
22
  def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
23
+ """
24
+ Execute a series of guardrails on a given prompt and return the results.
25
+
26
+ This method iterates over a list of Guardrail objects, applying each guardrail's
27
+ `guard` method to the provided prompt. It collects responses from each guardrail
28
+ and compiles them into a summary report. The function also determines the overall
29
+ safety of the prompt based on the responses from the guardrails.
30
+
31
+ Args:
32
+ prompt (str): The input prompt to be evaluated by the guardrails.
33
+ progress_bar (bool, optional): If True, displays a progress bar while
34
+ processing the guardrails. Defaults to True.
35
+ **kwargs: Additional keyword arguments to be passed to each guardrail's
36
+ `guard` method.
37
+
38
+ Returns:
39
+ dict: A dictionary containing:
40
+ - "safe" (bool): Indicates whether the prompt is considered safe
41
+ based on the guardrails' evaluations.
42
+ - "alerts" (list): A list of dictionaries, each containing the name
43
+ of the guardrail and its response.
44
+ - "summary" (str): A formatted string summarizing the results of
45
+ each guardrail's evaluation.
46
+ """
47
  alerts, summaries, safe = [], "", True
48
  iterable = (
49
  track(self.guardrails, description="Running guardrails")
 
65
 
66
  @weave.op()
67
  def predict(self, prompt: str, **kwargs) -> dict:
68
+ """
69
+ Predicts the safety and potential issues of a given input prompt using the guardrails.
70
+
71
+ This function serves as a wrapper around the `guard` method, providing a simplified
72
+ interface for evaluating the input prompt without displaying a progress bar. It
73
+ applies a series of guardrails to the prompt and returns a detailed assessment.
74
+
75
+ Args:
76
+ prompt (str): The input prompt to be evaluated by the guardrails.
77
+ **kwargs: Additional keyword arguments to be passed to each guardrail's
78
+ `guard` method.
79
+
80
+ Returns:
81
+ dict: A dictionary containing:
82
+ - "safe" (bool): Indicates whether the prompt is considered safe
83
+ based on the guardrails' evaluations.
84
+ - "alerts" (list): A list of dictionaries, each containing the name
85
+ of the guardrail and its response.
86
+ - "summary" (str): A formatted string summarizing the results of
87
+ each guardrail's evaluation.
88
+ """
89
  return self.guard(prompt, progress_bar=False, **kwargs)
guardrails_genie/llm.py CHANGED
@@ -10,13 +10,14 @@ class OpenAIModel(weave.Model):
10
  A class to interface with OpenAI's language models using the Weave framework.
11
 
12
  This class provides methods to create structured messages and generate predictions
13
- using OpenAI's chat completion API. It is designed to work with both single and
14
- multiple user prompts, and optionally includes a system prompt to guide the model's
15
  responses.
16
 
17
  Args:
18
  model_name (str): The name of the OpenAI model to be used for predictions.
19
  """
 
20
  model_name: str
21
  _openai_client: OpenAI
22
 
@@ -34,19 +35,19 @@ class OpenAIModel(weave.Model):
34
  """
35
  Create a list of messages for the OpenAI chat completion API.
36
 
37
- This function constructs a list of messages in the format required by the
38
- OpenAI chat completion API. It takes user prompts, an optional system prompt,
39
- and an optional list of existing messages, and combines them into a single
40
  list of messages.
41
 
42
  Args:
43
- user_prompts (Union[str, list[str]]): A single user prompt or a list of
44
  user prompts to be included in the messages.
45
- system_prompt (Optional[str]): An optional system prompt to guide the
46
- model's responses. If provided, it will be added at the beginning
47
  of the messages list.
48
- messages (Optional[list[dict]]): An optional list of existing messages
49
- to which the new prompts will be appended. If not provided, a new
50
  list will be created.
51
 
52
  Returns:
@@ -71,21 +72,21 @@ class OpenAIModel(weave.Model):
71
  """
72
  Generate a chat completion response using the OpenAI API.
73
 
74
- This function takes user prompts, an optional system prompt, and an optional
75
- list of existing messages to create a list of messages formatted for the
76
- OpenAI chat completion API. It then sends these messages to the OpenAI API
77
  to generate a chat completion response.
78
 
79
  Args:
80
- user_prompts (Union[str, list[str]]): A single user prompt or a list of
81
  user prompts to be included in the messages.
82
- system_prompt (Optional[str]): An optional system prompt to guide the
83
- model's responses. If provided, it will be added at the beginning
84
  of the messages list.
85
- messages (Optional[list[dict]]): An optional list of existing messages
86
- to which the new prompts will be appended. If not provided, a new
87
  list will be created.
88
- **kwargs: Additional keyword arguments to be passed to the OpenAI API
89
  for chat completion.
90
 
91
  Returns:
 
10
  A class to interface with OpenAI's language models using the Weave framework.
11
 
12
  This class provides methods to create structured messages and generate predictions
13
+ using OpenAI's chat completion API. It is designed to work with both single and
14
+ multiple user prompts, and optionally includes a system prompt to guide the model's
15
  responses.
16
 
17
  Args:
18
  model_name (str): The name of the OpenAI model to be used for predictions.
19
  """
20
+
21
  model_name: str
22
  _openai_client: OpenAI
23
 
 
35
  """
36
  Create a list of messages for the OpenAI chat completion API.
37
 
38
+ This function constructs a list of messages in the format required by the
39
+ OpenAI chat completion API. It takes user prompts, an optional system prompt,
40
+ and an optional list of existing messages, and combines them into a single
41
  list of messages.
42
 
43
  Args:
44
+ user_prompts (Union[str, list[str]]): A single user prompt or a list of
45
  user prompts to be included in the messages.
46
+ system_prompt (Optional[str]): An optional system prompt to guide the
47
+ model's responses. If provided, it will be added at the beginning
48
  of the messages list.
49
+ messages (Optional[list[dict]]): An optional list of existing messages
50
+ to which the new prompts will be appended. If not provided, a new
51
  list will be created.
52
 
53
  Returns:
 
72
  """
73
  Generate a chat completion response using the OpenAI API.
74
 
75
+ This function takes user prompts, an optional system prompt, and an optional
76
+ list of existing messages to create a list of messages formatted for the
77
+ OpenAI chat completion API. It then sends these messages to the OpenAI API
78
  to generate a chat completion response.
79
 
80
  Args:
81
+ user_prompts (Union[str, list[str]]): A single user prompt or a list of
82
  user prompts to be included in the messages.
83
+ system_prompt (Optional[str]): An optional system prompt to guide the
84
+ model's responses. If provided, it will be added at the beginning
85
  of the messages list.
86
+ messages (Optional[list[dict]]): An optional list of existing messages
87
+ to which the new prompts will be appended. If not provided, a new
88
  list will be created.
89
+ **kwargs: Additional keyword arguments to be passed to the OpenAI API
90
  for chat completion.
91
 
92
  Returns:
mkdocs.yml CHANGED
@@ -59,6 +59,12 @@ extra_javascript:
59
 
60
  nav:
61
  - Home: 'index.md'
 
 
 
 
 
 
62
  - LLM: 'llm.md'
63
  - Metrics: 'metrics.md'
64
  - RegexModel: 'regex_model.md'
 
59
 
60
  nav:
61
  - Home: 'index.md'
62
+ - Guardrails:
63
+ - Guardrail Base Class: 'guardrails/base.md'
64
+ - Guardrail Manager: 'guardrails/manager.md'
65
+ - Prompt Injection Guardrails:
66
+ - Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
67
+ - Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
68
  - LLM: 'llm.md'
69
  - Metrics: 'metrics.md'
70
  - RegexModel: 'regex_model.md'