TrustAI Guard Evaluation
TrustAI Guard Evaluation Tutorial
TrustAI Guard has undergone rigorous baseline evaluation. In this tutorial, we show you how to evaluate its performance across different categories on a few datasets, and we provide a framework you can use to evaluate it on your own data.
First, we need to install a few dependencies:
pip install pandas datasets numpy requests
Evaluate on a prompt injection dataset
Here is a simple code example:
import os
from typing import Optional

import numpy as np
import pandas as pd
import requests
from datasets import load_dataset

DEBUG = False
if DEBUG:
    # Route traffic through a local proxy when debugging.
    os.environ['HTTP_PROXY'] = 'http://127.0.0.1:1087'
    os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:1087'

# Reuse a single HTTP session for all requests.
session = requests.Session()


def guard_score(prompt: str, category: str) -> float:
    """Makes a request to the TrustAI Guard and returns the score for a category."""
    response = session.post(
        "https://platform.trustai.pro/v1/prompt_guard",
        json={"input": prompt},
        headers={"Authorization": "Bearer sk-trustaijust4demopromptguard"},
    )
    response_json = response.json()
    for result in response_json:
        if result['model'] == 'trustai-guard-prompt_injection':
            return result["results"][0]["category_scores"][category]
    return 0.0


def eval_guard(
    df: pd.DataFrame,
    category: str,
    max_size: Optional[int] = None,
    score_thr: float = 0.5,
):
    """Computes standard detection metrics on the input DataFrame for the given category."""
    if category not in ["prompt_injection", "jailbreaks", "pii", "prohibited_content"]:
        raise ValueError(f"The category {category} does not correspond to an existing endpoint.")
    predictions, labels = [], []
    max_size = max_size if max_size is not None else len(df)
    # Iterate over your dataset.
    for _, row in df.head(max_size).iterrows():
        predictions.append(guard_score(row.text, category) > score_thr)
        labels.append(row.label)
    predictions = np.array(predictions)
    labels = np.array(labels)
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    print(f"False positives: {false_positives} (total: {len(predictions)})")
    print(f"False negatives: {false_negatives} (total: {len(predictions)})")
    print(f"Accuracy: {np.mean(predictions == labels)}")


if __name__ == '__main__':
    # Great, let's run a few evaluations.
    # We start by loading a prompt injection dataset available on Hugging Face.
    # Its injection examples resemble prompts like "Ignore all instructions".
    data = load_dataset("deepset/prompt-injections")
    df = pd.DataFrame(data["test"])

    # We only evaluate on 100 prompts, but feel free to change that: omitting the
    # max_size parameter evaluates on the whole dataset.
    eval_guard(df, "prompt_injection", max_size=100)

    # You can also modify the score threshold. By default, a score above 0.5 leads
    # to a detection, but you can trade precision for recall by adjusting it.
    # For example, let's evaluate on the same dataset with a looser detection threshold:
    eval_guard(df, "prompt_injection", max_size=100, score_thr=0.3)
Evaluate on a benign Q&A dataset
For LLM protection to be useful in a real use case, the Risk Classifier should not trigger on benign data: frequent false alarms quickly degrade the user experience.
In the following, we evaluate TrustAI Guard on a dataset of benign questions, using the CS-EVAL dataset from Hugging Face. Since every prompt in this dataset is benign, false negatives are impossible and accuracy is simply one minus the false-positive rate.
import os
from typing import Optional

import numpy as np
import pandas as pd
import requests
from datasets import load_dataset

DEBUG = False
if DEBUG:
    # Route traffic through a local proxy when debugging.
    os.environ['HTTP_PROXY'] = 'http://127.0.0.1:1087'
    os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:1087'

# Reuse a single HTTP session for all requests.
session = requests.Session()


def guard_score(prompt: str, category: str) -> float:
    """Makes a request to the TrustAI Guard and returns the score for a category."""
    response = session.post(
        "https://platform.trustai.pro/v1/prompt_guard",
        json={"input": prompt},
        headers={"Authorization": "Bearer sk-trustaijust4demopromptguard"},
    )
    response_json = response.json()
    for result in response_json:
        if result['model'] == 'trustai-guard-prompt_injection':
            return result["results"][0]["category_scores"][category]
    return 0.0


def eval_guard(
    df: pd.DataFrame,
    category: str,
    max_size: Optional[int] = None,
    score_thr: float = 0.5,
):
    """Computes standard detection metrics on the input DataFrame for the given category."""
    if category not in ["prompt_injection", "jailbreaks", "pii", "prohibited_content"]:
        raise ValueError(f"The category {category} does not correspond to an existing endpoint.")
    predictions, labels = [], []
    max_size = max_size if max_size is not None else len(df)
    # Iterate over your dataset.
    for _, row in df.head(max_size).iterrows():
        predictions.append(guard_score(row.text, category) > score_thr)
        labels.append(row.label)
    predictions = np.array(predictions)
    labels = np.array(labels)
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    print(f"False positives: {false_positives} (total: {len(predictions)})")
    print(f"False negatives: {false_negatives} (total: {len(predictions)})")
    print(f"Accuracy: {np.mean(predictions == labels)}")


if __name__ == '__main__':
    # Load a benign Q&A dataset from Hugging Face. Every prompt here is benign,
    # so we label all rows with 0 and rename the prompt column to "text".
    data = load_dataset("cseval/cs-eval")
    df = pd.DataFrame(data["test"])
    df["label"] = 0
    df.rename(columns={"prompt": "text"}, inplace=True)

    eval_guard(df, "prompt_injection", max_size=100)