Spaces:
Running
Running
geekyrakshit
commited on
Commit
·
1d117f2
1
Parent(s):
3ed3941
add: off-the-shelf prompt injection guardrail
Browse files
guardrails_genie/guardrails/__init__.py
CHANGED
@@ -1,4 +1,8 @@
|
|
1 |
-
from .injection import PromptInjectionSurveyGuardrail
|
2 |
from .manager import GuardrailManager
|
3 |
|
4 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
1 |
+
from .injection import PromptInjectionProtectAIGuardrail, PromptInjectionSurveyGuardrail
|
2 |
from .manager import GuardrailManager
|
3 |
|
4 |
+
__all__ = [
|
5 |
+
"PromptInjectionSurveyGuardrail",
|
6 |
+
"PromptInjectionProtectAIGuardrail",
|
7 |
+
"GuardrailManager",
|
8 |
+
]
|
guardrails_genie/guardrails/injection/__init__.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from .survey_guardrail import PromptInjectionSurveyGuardrail
|
2 |
|
3 |
-
__all__ = ["PromptInjectionSurveyGuardrail"]
|
|
|
1 |
+
from .protectai_guardrail import PromptInjectionProtectAIGuardrail
|
2 |
from .survey_guardrail import PromptInjectionSurveyGuardrail
|
3 |
|
4 |
+
__all__ = ["PromptInjectionSurveyGuardrail", "PromptInjectionProtectAIGuardrail"]
|
guardrails_genie/guardrails/injection/protectai_guardrail.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import weave
|
5 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
6 |
+
from transformers.pipelines.base import Pipeline
|
7 |
+
|
8 |
+
from ..base import Guardrail
|
9 |
+
|
10 |
+
|
11 |
+
class PromptInjectionProtectAIGuardrail(Guardrail):
|
12 |
+
model_name: str = "ProtectAI/deberta-v3-base-prompt-injection-v2"
|
13 |
+
_classifier: Optional[Pipeline] = None
|
14 |
+
|
15 |
+
def model_post_init(self, __context):
|
16 |
+
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
17 |
+
model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
|
18 |
+
self._classifier = pipeline(
|
19 |
+
"text-classification",
|
20 |
+
model=model,
|
21 |
+
tokenizer=tokenizer,
|
22 |
+
truncation=True,
|
23 |
+
max_length=512,
|
24 |
+
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
|
25 |
+
)
|
26 |
+
|
27 |
+
@weave.op()
|
28 |
+
def predict(self, prompt: str):
|
29 |
+
return self._classifier(prompt)
|
30 |
+
|
31 |
+
@weave.op()
|
32 |
+
def guard(self, prompt: str):
|
33 |
+
response = self.predict(prompt)
|
34 |
+
return {"safe": response[0]["label"] != "INJECTION"}
|