param-bharat committed
Commit 63bfd18 (1 parent: f4dbf56)

feat: add secrets detection benchmarking script

Files changed (1)
  1. benchmarks/secrets_benchmark.py +166 -0
benchmarks/secrets_benchmark.py ADDED
@@ -0,0 +1,166 @@
+ import asyncio
+ from typing import Any
+
+ import weave
+ from guardrails import Guard
+ from guardrails.hub import SecretsPresent
+ from llm_guard.input_scanners import Secrets
+ from llm_guard.util import configure_logger
+
+ from guardrails_genie.guardrails import GuardrailManager
+ from guardrails_genie.guardrails.base import Guardrail
+ from guardrails_genie.guardrails.secrets_detection import (
+     SecretsDetectionResponse,
+     SecretsDetectionSimpleResponse,
+     SecretsDetectionGuardrail,
+ )
+ from guardrails_genie.metrics import AccuracyMetric
+
+ logger = configure_logger(log_level="ERROR")
+
+
+ class GuardrailsAISecretsDetector(Guardrail):
+     validator: Any
+
+     def __init__(self):
+         validator = Guard().use(SecretsPresent, on_fail="fix")
+         super().__init__(validator=validator)
+
+     def scan(self, text: str) -> dict:
+         response = self.validator.validate(text)
+         if response.validation_summaries:
+             summary = response.validation_summaries[0]
+             return {
+                 "has_secret": True,
+                 "detected_secrets": {
+                     str(k): v
+                     for k, v in enumerate(
+                         summary.failure_reason.splitlines()[1:], start=1
+                     )
+                 },
+                 "explanation": summary.failure_reason,
+                 "modified_prompt": response.validated_output,
+                 "risk_score": 1.0,
+             }
+         else:
+             return {
+                 "has_secret": False,
+                 "detected_secrets": None,
+                 "explanation": "No secrets detected in the text.",
+                 "modified_prompt": response.validated_output,
+                 "risk_score": 0.0,
+             }
+
+     @weave.op
+     def guard(
+         self,
+         prompt: str,
+         return_detected_secrets: bool = True,
+         **kwargs,
+     ) -> SecretsDetectionResponse | SecretsDetectionSimpleResponse:
+         results = self.scan(prompt)
+
+         if return_detected_secrets:
+             return SecretsDetectionResponse(
+                 contains_secrets=results["has_secret"],
+                 detected_secrets=results["detected_secrets"],
+                 explanation=results["explanation"],
+                 redacted_text=results["modified_prompt"],
+                 risk_score=results["risk_score"],
+             )
+         else:
+             return SecretsDetectionSimpleResponse(
+                 contains_secrets=results["has_secret"],
+                 explanation=results["explanation"],
+                 redacted_text=results["modified_prompt"],
+                 risk_score=results["risk_score"],
+             )
+
+
+ class LLMGuardSecretsDetector(Guardrail):
+     validator: Any
+
+     def __init__(self):
+         validator = Secrets(redact_mode="all")
+         super().__init__(validator=validator)
+
+     def scan(self, text: str) -> dict:
+         sanitized_prompt, is_valid, risk_score = self.validator.scan(text)
+         if is_valid:
+             return {
+                 "has_secret": not is_valid,
+                 "detected_secrets": None,
+                 "explanation": "No secrets detected in the text.",
+                 "modified_prompt": sanitized_prompt,
+                 "risk_score": risk_score,
+             }
+         else:
+             return {
+                 "has_secret": not is_valid,
+                 "detected_secrets": {},
+                 "explanation": "This library does not return detected secrets.",
+                 "modified_prompt": sanitized_prompt,
+                 "risk_score": risk_score,
+             }
+
+     @weave.op
+     def guard(
+         self,
+         prompt: str,
+         return_detected_secrets: bool = True,
+         **kwargs,
+     ) -> SecretsDetectionResponse | SecretsDetectionSimpleResponse:
+         results = self.scan(prompt)
+         if return_detected_secrets:
+             return SecretsDetectionResponse(
+                 contains_secrets=results["has_secret"],
+                 detected_secrets=results["detected_secrets"],
+                 explanation=results["explanation"],
+                 redacted_text=results["modified_prompt"],
+                 risk_score=results["risk_score"],
+             )
+         else:
+             return SecretsDetectionSimpleResponse(
+                 contains_secrets=results["has_secret"],
+                 explanation=results["explanation"],
+                 redacted_text=results["modified_prompt"],
+                 risk_score=results["risk_score"],
+             )
+
+
+ def main():
+     weave.init("parambharat/secrets-detection")
+     dataset = weave.ref("secrets-detection-benchmark:latest").get()
+     llm_guard_guardrail = LLMGuardSecretsDetector()
+     guardrails_ai_guardrail = GuardrailsAISecretsDetector()
+     guardrails_genie_guardrail = SecretsDetectionGuardrail()
+
+     all_guards = [
+         llm_guard_guardrail,
+         guardrails_ai_guardrail,
+         guardrails_genie_guardrail,
+     ]
+     evaluation = weave.Evaluation(
+         dataset=dataset.rows,
+         scorers=[AccuracyMetric()],
+     )
+
+     for guard in all_guards:
+         name = guard.__class__.__name__
+         guardrail_manager = GuardrailManager(
+             guardrails=[
+                 guard,
+             ]
+         )
+
+         results = asyncio.run(
+             evaluation.evaluate(
+                 guardrail_manager,
+                 __weave={"display_name": name},
+             )
+         )
+         print(results)
+
+
+ if __name__ == "__main__":
+     main()
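
For a quick smoke test of the two third-party wrappers outside the evaluation loop, a minimal sketch along these lines should work. It assumes the SecretsPresent validator has already been installed from the Guardrails Hub, and the sample value is the placeholder access key from the AWS documentation, not a real credential:

    import weave

    from benchmarks.secrets_benchmark import (
        GuardrailsAISecretsDetector,
        LLMGuardSecretsDetector,
    )

    weave.init("parambharat/secrets-detection")  # enables tracing of the @weave.op calls

    prompt = "my access key is AKIAIOSFODNN7EXAMPLE"  # AWS docs placeholder key
    for detector in (GuardrailsAISecretsDetector(), LLMGuardSecretsDetector()):
        response = detector.guard(prompt)
        # SecretsDetectionResponse exposes the fields populated in guard() above.
        print(type(detector).__name__, response.contains_secrets, response.redacted_text)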
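
AccuracyMetric is imported from guardrails_genie.metrics and its implementation is not part of this diff. As a rough illustration of the shape such a scorer takes in weave, a hypothetical stand-in might look like the following; the output dict and the contains_secrets label column are assumptions about the model output and the benchmark dataset, not the real API:

    import weave


    class ExactMatchAccuracy(weave.Scorer):
        # Hypothetical stand-in for guardrails_genie.metrics.AccuracyMetric;
        # the real implementation is not shown in this commit.
        @weave.op
        def score(self, output: dict, contains_secrets: bool) -> dict:
            # `output` is the guardrail manager's prediction for one dataset row;
            # `contains_secrets` is assumed to be the dataset's label column.
            predicted = bool(output.get("contains_secrets", False))
            return {"correct": predicted == contains_secrets}

        @weave.op
        def summarize(self, score_rows: list) -> dict:
            flags = [row["correct"] for row in score_rows if row is not None]
            return {"accuracy": sum(flags) / len(flags) if flags else 0.0}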
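
With weave, guardrails-ai, llm-guard, and the guardrails_genie package installed, the benchmark should be runnable directly, e.g. python benchmarks/secrets_benchmark.py; each of the three detectors then appears as its own named evaluation run in the parambharat/secrets-detection Weave project.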