PII Loss Tutorial

In this article, we evaluate the TrustAI Guard on a PII dataset to show how this evaluation framework can be used with any other GenAI application.

Here, we use a PII dataset from Hugging Face. We set the label to positive, since we expect PII to be present in all the rows.

import os
import requests
from typing import Optional
import numpy as np
import pandas as pd
from datasets import load_dataset


# Set to True to send HTTP and HTTPS traffic through the local proxy
# listening on 127.0.0.1:1087.
Proxy = False
# The guard must test the flag declared above; testing an undefined name
# would abort the script with a NameError before anything else runs.
if Proxy:
    os.environ['HTTP_PROXY'] = 'http://127.0.0.1:1087'
    os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:1087'


def guard_score(prompt: str, category: str) -> float:
    """Return the TrustAI Guard PII score of ``prompt`` for ``category``.

    The prompt is posted to the prompt-guard endpoint through the
    module-level ``session`` (created in the ``__main__`` block), and the
    score is read from the response entry whose ``model`` is
    ``trustai-guard-PII``.

    Args:
        prompt: Text to be scored.
        category: Key looked up in the ``category_scores`` mapping of the
            first result of the PII model entry.

    Returns:
        The value stored under ``category`` in ``category_scores``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        LookupError: If the response contains no ``trustai-guard-PII`` entry.
    """
    response = session.post(
        "https://platform.trustai.pro/v1/prompt_guard",
        json={"input": prompt},
        # NOTE(review): hard-coded key; the name suggests a demo credential.
        headers={"Authorization": "Bearer sk-trustaijust4demopromptguard"},
        # requests waits indefinitely unless a timeout is given.
        timeout=30,
    )
    # Fail on HTTP errors here rather than on an unexpected JSON payload below.
    response.raise_for_status()

    for entry in response.json():
        if entry["model"] == "trustai-guard-PII":
            return entry["results"][0]["category_scores"][category]

    # Falling off the end would return None and make the caller's threshold
    # comparison fail with an unrelated-looking TypeError.
    raise LookupError("The response contains no 'trustai-guard-PII' entry.")


def eval_guard(
    df: pd.DataFrame,
    category: str,
    max_size: Optional[int] = None,
    score_thr: Optional[float] = 0.5
):
    """Score the first rows of ``df`` with the guard and print detection metrics.

    Each row's ``text`` is sent to ``guard_score``; a row counts as detected
    when its score is strictly greater than ``score_thr``. Detections are
    compared with the row's ``label`` and the number of false positives,
    the number of false negatives and the accuracy are printed.

    Args:
        df: Data with a ``text`` and a ``label`` column.
        category: One of ``prompt_injection``, ``jailbreaks``, ``pii`` or
            ``prohibited_content``.
        max_size: Number of leading rows to evaluate; all rows when ``None``.
        score_thr: Threshold a score has to exceed to count as a detection.

    Raises:
        ValueError: If ``category`` is not one of the accepted names.
    """
    known_categories = ("prompt_injection", "jailbreaks", "pii", "prohibited_content")
    if category not in known_categories:
        raise ValueError(f"The category {category} does not correspond to an existing endpoint.")

    limit = len(df) if max_size is None else max_size

    # Collect (detected, expected) pairs row by row, in dataset order.
    outcomes = []
    for _, sample in df.head(limit).iterrows():
        detected = guard_score(sample.text, category) > score_thr
        outcomes.append((detected, sample.label))

    predicted = np.array([detected for detected, _ in outcomes])
    expected = np.array([label for _, label in outcomes])

    total = len(predicted)
    wrongly_flagged = np.sum((predicted == 1) & (expected == 0))
    missed = np.sum((predicted == 0) & (expected == 1))
    accuracy = np.mean(predicted == expected)

    print(f"False positives: {wrongly_flagged} (total: {total})")
    print(f"False negatives: {missed} (total: {total})")
    print(f"Accuracy: {accuracy}")


if __name__ == '__main__':
    # `session` is bound at module level because guard_score() reads it as a
    # global; the context manager closes the session once the evaluation
    # has finished.
    with requests.Session() as session:
        data = load_dataset("dddd322/dataleak")
        df = pd.DataFrame(data["train"])
        # Every row is expected to contain PII, so all labels are positive.
        df["label"] = 1
        # eval_guard() reads the prompt from the `text` column.
        df.rename(columns={"input": "text"}, inplace=True)
        # Print the first few samples, each followed by a blank line.
        for _, row in df.head(5).iterrows():
            print(row.text)
            print()
        eval_guard(df, "pii", max_size=100)

Depending on the size of the test data set, you will get the evaluation results after waiting for a few seconds.

Previous: Toxic Generation Tutorial | Next: Q&A RAG Tutorial

Last updated 11 months ago