or_bench_dataset.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
    _RemoteDatasetLoader,
)
from pyrit.models import SeedDataset, SeedPrompt

logger = logging.getLogger(__name__)


class _ORBenchBaseDataset(_RemoteDatasetLoader):
    """
    Base loader for OR-Bench datasets from HuggingFace.

    Subclasses must set CONFIG and DESCRIPTION, and provide a dataset_name property.

    References:
        - https://huggingface.co/datasets/bench-llm/OR-Bench
        - Cui et al., 2024, "OR-Bench: An Over-Refusal Benchmark for Large Language Models"

    License: CC BY 4.0

    Warning: This dataset contains prompts designed to test over-refusal behavior in LLMs,
    including potentially harmful and toxic content.
    """

    HF_DATASET_NAME: str = "bench-llm/OR-Bench"
    CONFIG: str
    DESCRIPTION: str

    def __init__(self, *, split: str = "train") -> None:
        """
        Initialize the OR-Bench dataset loader.

        Args:
            split: Dataset split to load. Defaults to "train".
        """
        self.split = split

    async def fetch_dataset(self, *, cache: bool = True) -> SeedDataset:
        """
        Fetch the OR-Bench dataset from HuggingFace and return it as a SeedDataset.

        Args:
            cache: Whether to cache the fetched dataset. Defaults to True.

        Returns:
            SeedDataset: A SeedDataset containing the OR-Bench prompts.
        """
        logger.info(f"Loading OR-Bench dataset from {self.HF_DATASET_NAME} (config={self.CONFIG})")

        data = await self._fetch_from_huggingface(
            dataset_name=self.HF_DATASET_NAME,
            config=self.CONFIG,
            split=self.split,
            cache=cache,
        )

        authors = [
            "Justin Cui",
            "Wei-Lin Chiang",
            "Ion Stoica",
            "Cho-Jui Hsieh",
        ]
        source_url = f"https://huggingface.co/datasets/{self.HF_DATASET_NAME}"
        groups = ["UCLA", "UC Berkeley"]

        seed_prompts = [
            SeedPrompt(
                value=item["prompt"],
                data_type="text",
                dataset_name=self.dataset_name,
                harm_categories=[item["category"]] if item.get("category") else [],
                description=self.DESCRIPTION,
                source=source_url,
                authors=authors,
                groups=groups,
            )
            for item in data
        ]

        logger.info(f"Successfully loaded {len(seed_prompts)} prompts from OR-Bench dataset")
        return SeedDataset(seeds=seed_prompts, dataset_name=self.dataset_name)


class _ORBench80KDataset(_ORBenchBaseDataset):
    """
    Loader for the OR-Bench 80K dataset.

    Contains ~80k over-refusal prompts categorized into 10 common rejection categories.
    This is the main comprehensive benchmark for evaluating LLM over-refusal behavior.
    """

    CONFIG: str = "or-bench-80k"
    DESCRIPTION: str = (
        "OR-Bench 80K contains ~80k over-refusal prompts categorized into 10 rejection "
        "categories. This is the main comprehensive benchmark for evaluating LLM over-refusal."
    )

    @property
    def dataset_name(self) -> str:
        """Return the dataset name."""
        return "or_bench_80k"


class _ORBenchHardDataset(_ORBenchBaseDataset):
    """
    Loader for the OR-Bench Hard-1K dataset.

    Contains ~1k challenging safe prompts that commonly trigger over-refusal in LLMs.
    These are prompts that models should be able to answer without refusing.
    """

    CONFIG: str = "or-bench-hard-1k"
    DESCRIPTION: str = (
        "OR-Bench Hard-1K contains ~1k challenging safe prompts that commonly trigger "
        "over-refusal in language models. These prompts should be answerable without refusing."
    )

    @property
    def dataset_name(self) -> str:
        """Return the dataset name."""
        return "or_bench_hard"


class _ORBenchToxicDataset(_ORBenchBaseDataset):
    """
    Loader for the OR-Bench Toxic dataset.

    Contains toxic prompts that language models should correctly refuse.
    Used as a contrast set to evaluate whether models can distinguish
    genuinely harmful prompts from safe ones.
    """

    CONFIG: str = "or-bench-toxic"
    DESCRIPTION: str = (
        "OR-Bench Toxic contains toxic prompts that language models should correctly refuse. "
        "Used as a contrast set to evaluate refusal calibration."
    )

    @property
    def dataset_name(self) -> str:
        """Return the dataset name."""
        return "or_bench_toxic"