geekyrakshit committed on
Commit
b077b7d
·
1 Parent(s): a1c5338

add: guardrails manager

Browse files
guardrails_genie/guardrails/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from .injection import SurveyGuardrail
 
2
 
3
- __all__ = ["SurveyGuardrail"]
 
1
  from .injection import SurveyGuardrail
2
+ from .manager import GuardrailManager
3
 
4
+ __all__ = ["SurveyGuardrail", "GuardrailManager"]
guardrails_genie/guardrails/base.py CHANGED
@@ -11,7 +11,3 @@ class Guardrail(weave.Model):
11
  @weave.op()
12
  def guard(self, prompt: str, **kwargs) -> list[str]:
13
  pass
14
-
15
- @weave.op()
16
- def predict(self, prompt: str, **kwargs) -> list[str]:
17
- return self.guard(prompt, **kwargs)
 
11
  @weave.op()
12
  def guard(self, prompt: str, **kwargs) -> list[str]:
13
  pass
 
 
 
 
guardrails_genie/guardrails/injection/survey_guardrail.py CHANGED
@@ -17,7 +17,7 @@ class SurveyGuardrailResponse(BaseModel):
17
 
18
  class SurveyGuardrail(Guardrail):
19
  llm_model: OpenAIModel
20
-
21
  @weave.op()
22
  def load_prompt_injection_survey(self) -> str:
23
  prompt_injection_survey_path = os.path.join(
@@ -61,7 +61,7 @@ Here are some strict instructions that you must follow:
61
  return user_prompt, system_prompt
62
 
63
  @weave.op()
64
- def guard(self, prompt: str, **kwargs) -> list[str]:
65
  user_prompt, system_prompt = self.format_prompts(prompt)
66
  chat_completion = self.llm_model.predict(
67
  user_prompts=user_prompt,
@@ -70,3 +70,8 @@ Here are some strict instructions that you must follow:
70
  **kwargs,
71
  )
72
  return chat_completion.choices[0].message.parsed
 
 
 
 
 
 
17
 
18
  class SurveyGuardrail(Guardrail):
19
  llm_model: OpenAIModel
20
+
21
  @weave.op()
22
  def load_prompt_injection_survey(self) -> str:
23
  prompt_injection_survey_path = os.path.join(
 
61
  return user_prompt, system_prompt
62
 
63
  @weave.op()
64
+ def predict(self, prompt: str, **kwargs) -> list[str]:
65
  user_prompt, system_prompt = self.format_prompts(prompt)
66
  chat_completion = self.llm_model.predict(
67
  user_prompts=user_prompt,
 
70
  **kwargs,
71
  )
72
  return chat_completion.choices[0].message.parsed
73
+
74
+ @weave.op()
75
+ def guard(self, prompt: str, **kwargs) -> list[str]:
76
+ response = self.predict(prompt, **kwargs)
77
+ return {"verdict": response.injection_prompt}
guardrails_genie/guardrails/manager.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weave
2
+ from weave.flow.obj import Object as WeaveObject
3
+
4
+ from .base import Guardrail
5
+
6
+
7
class GuardrailManager(WeaveObject):
    """Run a prompt through a collection of guardrails and gather their responses."""

    # Guardrails to apply; they are invoked in list order by `guard`.
    guardrails: list[Guardrail]

    @weave.op()
    def guard(self, prompt: str, **kwargs) -> list[dict]:
        """Apply every configured guardrail to ``prompt``.

        Args:
            prompt: The text handed to each guardrail's ``guard`` method.
            **kwargs: Extra keyword arguments forwarded unchanged to each
                guardrail's ``guard`` call.

        Returns:
            A list with one single-entry dict per guardrail, in the same order
            as ``self.guardrails``, mapping ``guardrail.name`` to whatever that
            guardrail's ``guard`` call returned. The list is empty when no
            guardrails are configured.
        """
        # NOTE(review): assumes every guardrail exposes a `name` attribute
        # (presumably provided by the weave base object) -- confirm it is set,
        # otherwise the dict key may not be meaningful.
        alerts = []
        for guardrail in self.guardrails:
            response = guardrail.guard(prompt, **kwargs)
            alerts.append({guardrail.name: response})
        return alerts