TrustAI Guard Evaluation
TrustAI Guard Evaluation Tutorial
pip install pandas datasets numpyEvaluate on a prompt injection dataset
import os
import requests
from typing import Optional
import numpy as np
import pandas as pd
from datasets import load_dataset
DEBUG = False
if DEBUG:
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:1087'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:1087'
def guard_score(prompt: str, category: str) -> float:
"""Makes a request to the TrustAI Guard and returns the score for a category."""
response = session.post(
f"https://platform.trustai.pro/v1/prompt_guard",
json={"input": prompt},
headers={"Authorization": f"Bearer sk-trustaijust4demopromptguard"},
)
response_json = response.json()
for i in response_json:
if i['model'] == 'trustai-guard-prompt_injection':
return i["results"][0]["category_scores"][category]
return 0.0
def eval_guard(
df: pd.DataFrame,
category: str,
max_size: Optional[int] = None,
score_thr: Optional[float] = 0.5
):
"""Computes standard detection metrics on the input DataFrame for the given category."""
if category not in ["prompt_injection", "jailbreaks", "pii", "prohibited_content"]:
raise ValueError(f"The category {category} does not correspond to an existing endpoint.")
predictions, labels = [], []
max_size = max_size if max_size is not None else len(df)
# Iterate over your dataset.
for _, row in df.head(max_size).iterrows():
predictions.append(guard_score(row.text, category) > score_thr)
labels.append(row.label)
predictions = np.array(predictions)
labels = np.array(labels)
false_positives = np.sum((predictions == 1) & (labels == 0))
false_negatives = np.sum((predictions == 0) & (labels == 1))
print(f"False positives: {false_positives} (total: {len(predictions)})")
print(f"False negatives: {false_negatives} (total: {len(predictions)})")
print(f"Accuracy: {np.mean(predictions == labels)}")
if __name__ == '__main__':
session = requests.Session()
"""Great, let's run a few evaluations.
We start by loading a prompt injection dataset available in Hugging Face.
This dataset was similar to the prompt "Ignore all instructions"."""
data = load_dataset("deepset/prompt-injections")
df = pd.DataFrame(data["test"])
"""We only evaluate on 100 prompts, but feel free to modify that: not providing the max_size parameter will evaluate on the whole dataset."""
eval_guard(df, "prompt_injection", max_size=100)
"""You can also modify the score threshold. By default, a score above 0.5 leads to a detection.
However, you can tradeoff precision for recall by modifying the threshold yourself.
For example, let's evaluate on the same dataset, with a looser detection threshold:"""
eval_guard(df, "prompt_injection", max_size=100, score_thr=0.3)Evaluate on a benign Q&A dataset
Last updated