PII Loss Tutorial
import os
import requests
from typing import Optional
import numpy as np
import pandas as pd
from datasets import load_dataset
# Set to True to send HTTP(S) traffic through the local proxy below.
Proxy = False
if Proxy:
    # Point the standard proxy environment variables at the local proxy.
    os.environ['HTTP_PROXY'] = 'http://127.0.0.1:1087'
    os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:1087'
def guard_score(prompt: str, category: str) -> float:
    """Query the TrustAI prompt guard and return the score for ``category``.

    The prompt is POSTed to the ``/v1/prompt_guard`` endpoint through the
    module-level ``session`` (the ``requests.Session`` created by the script
    entry point). The response is expected to be a list of per-model entries;
    the score is read from the first result of the ``trustai-guard-PII``
    entry.

    NOTE(review): only the ``trustai-guard-PII`` entry is consulted, whatever
    ``category`` is requested -- confirm this is intended for non-PII
    categories.

    Args:
        prompt: Text to be scored.
        category: Key to look up in the entry's ``category_scores`` mapping.

    Returns:
        The value stored under ``category`` in ``category_scores``.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        LookupError: If the response has no ``trustai-guard-PII`` entry.
        KeyError: If ``category`` is missing from ``category_scores``.
    """
    response = session.post(
        "https://platform.trustai.pro/v1/prompt_guard",
        json={"input": prompt},
        headers={"Authorization": "Bearer sk-trustaijust4demopromptguard"},
        # Without a timeout a stalled connection would block the run forever.
        timeout=30,
    )
    # Fail loudly on an error status instead of parsing an error payload
    # as if it were a list of model results.
    response.raise_for_status()

    for entry in response.json():
        if entry["model"] == "trustai-guard-PII":
            return entry["results"][0]["category_scores"][category]

    # Previously this fell through and returned None, which only surfaced
    # later as an unrelated TypeError in the caller's comparison.
    raise LookupError(
        "The guard response does not contain a 'trustai-guard-PII' entry."
    )
def eval_guard(
    df: pd.DataFrame,
    category: str,
    max_size: Optional[int] = None,
    score_thr: Optional[float] = 0.5
):
    """Score rows of ``df`` with ``guard_score`` and print detection metrics.

    Each evaluated row must provide a ``text`` value (the prompt) and a
    ``label`` value (1 for positive, 0 for negative). A row is predicted
    positive when its guard score is strictly greater than ``score_thr``.
    The number of false positives, false negatives and the accuracy are
    printed; nothing is returned.

    Args:
        df: Data to evaluate, with ``text`` and ``label`` columns.
        category: One of ``"prompt_injection"``, ``"jailbreaks"``, ``"pii"``
            or ``"prohibited_content"``.
        max_size: Evaluate only the first ``max_size`` rows; ``None`` means
            every row.
        score_thr: Decision threshold; ``None`` falls back to 0.5.

    Raises:
        ValueError: If ``category`` is not one of the supported names.
    """
    if category not in ("prompt_injection", "jailbreaks", "pii", "prohibited_content"):
        raise ValueError(f"The category {category} does not correspond to an existing endpoint.")

    # The annotation allows None, but comparing a score against None raises
    # TypeError, so treat None as "use the default threshold".
    if score_thr is None:
        score_thr = 0.5

    max_size = max_size if max_size is not None else len(df)

    flagged, expected = [], []
    for _, row in df.head(max_size).iterrows():
        flagged.append(guard_score(row.text, category) > score_thr)
        expected.append(row.label)

    # An explicit bool dtype keeps the masks below well-defined even when
    # nothing was evaluated.
    predictions = np.array(flagged, dtype=bool)
    labels = np.array(expected)

    false_positives = np.sum(predictions & (labels == 0))
    false_negatives = np.sum(~predictions & (labels == 1))

    # np.mean of an empty array emits a RuntimeWarning; report NaN directly
    # when there is nothing to average.
    if len(predictions):
        accuracy = np.mean(predictions == labels)
    else:
        accuracy = float("nan")

    print(f"False positives: {false_positives} (total: {len(predictions)})")
    print(f"False negatives: {false_negatives} (total: {len(predictions)})")
    print(f"Accuracy: {accuracy}")
if __name__ == '__main__':
    # guard_score() looks up the module-level name `session`, so the context
    # manager target must keep that exact name. The `with` block closes the
    # session's connections once the evaluation is finished.
    with requests.Session() as session:
        data = load_dataset("dddd322/dataleak")
        df = pd.DataFrame(data["train"])

        # Every sample is labelled as a positive (1) for this evaluation.
        df["label"] = 1
        # eval_guard() reads the prompt from a column named "text".
        df.rename(columns={"input": "text"}, inplace=True)

        # Show the first few prompts, separated by blank lines.
        for _, row in df.head(5).iterrows():
            print(row.text)
            print()

        eval_guard(df, "pii", max_size=100)

Last updated