-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path15_pii_extraction.py
More file actions
137 lines (117 loc) · 4.94 KB
/
15_pii_extraction.py
File metadata and controls
137 lines (117 loc) · 4.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
This example demonstrates how to create an agent that extracts and redacts Personally Identifiable
Information (PII) from text. It showcases:
1. Handling sensitive information with clear categorization
2. Structured output with both redacted text and extracted PII
3. Enum usage for PII categories
4. Comprehensive PII detection and redaction
"""
import asyncio
from enum import Enum
from pydantic import BaseModel, Field
import workflowai
from workflowai import Model
class PIIType(str, Enum):
    """Categories of Personal Identifiable Information."""

    # Each member's value equals its name; mixing in `str` lets members
    # compare equal to (and serialize as) their plain string value.

    # --- Identity and contact details ---
    NAME = "NAME"            # full, first, or last names
    EMAIL = "EMAIL"          # e-mail addresses
    PHONE = "PHONE"          # phone and fax numbers
    ADDRESS = "ADDRESS"      # physical addresses, postal codes

    # --- Government / personal identifiers ---
    SSN = "SSN"              # Social Security Numbers, national IDs
    DOB = "DOB"              # date of birth, age

    # --- Accounts and credentials ---
    FINANCIAL = "FINANCIAL"  # credit card numbers, bank accounts
    LICENSE = "LICENSE"      # driver's and professional licenses

    # --- Online presence and fallback ---
    URL = "URL"              # personal URLs, social media profiles
    OTHER = "OTHER"          # any PII that fits none of the categories above
class PIIExtraction(BaseModel):
    """Represents an extracted piece of PII with its type."""

    # Position convention: `start_index` and `end_index` are zero-based
    # character offsets into the original input text and form a half-open
    # span, i.e. `text == original[start_index:end_index]`. The field
    # descriptions spell this out so the offsets are unambiguous to whoever
    # (or whatever model) fills them in.
    text: str = Field(description="The extracted PII text")
    type: PIIType = Field(description="The category of PII")
    start_index: int = Field(
        description=(
            "Zero-based index of the first character of the PII in the original text"
        ),
    )
    end_index: int = Field(
        description=(
            "Zero-based index just past the last character of the PII in the "
            "original text (exclusive)"
        ),
    )
# Sample sentence containing a name, an e-mail address, a phone number,
# an SSN and a street address; used as the schema example for `PIIInput.text`.
_PII_INPUT_EXAMPLE = (
    "Hi, I'm John Doe. You can reach me at john.doe@email.com or call 555-0123. "
    "My SSN is 123-45-6789 and I live at 123 Main St, Springfield, IL 62701."
)


class PIIInput(BaseModel):
    """Input model for PII extraction."""

    # The only input: the raw text that should be scanned for PII.
    text: str = Field(
        description="The text to analyze for PII",
        examples=[_PII_INPUT_EXAMPLE],
    )
class PIIOutput(BaseModel):
    """Output model containing redacted text and extracted PII."""

    # Copy of the input in which every detected PII span is replaced by the
    # literal marker "[REDACTED]".
    redacted_text: str = Field(
        description="The original text with all PII replaced by [REDACTED]",
        examples=[
            "Hi, I'm [REDACTED]. You can reach me at [REDACTED] or call [REDACTED]. "
            "My SSN is [REDACTED] and I live at [REDACTED].",
        ],
    )

    # One entry per detected PII span. The example offsets below are
    # zero-based, end-exclusive positions into the example text of
    # `PIIInput.text` ("Hi, I'm John Doe. You can reach me at ..."), so that
    # `text == original[start_index:end_index]` holds for every item, and all
    # five spans redacted in the `redacted_text` example are listed.
    extracted_pii: list[PIIExtraction] = Field(
        description="List of extracted PII items with their types and positions",
        examples=[
            [
                {"text": "John Doe", "type": "NAME", "start_index": 8, "end_index": 16},
                {"text": "john.doe@email.com", "type": "EMAIL", "start_index": 38, "end_index": 56},
                {"text": "555-0123", "type": "PHONE", "start_index": 65, "end_index": 73},
                {"text": "123-45-6789", "type": "SSN", "start_index": 85, "end_index": 96},
                {
                    "text": "123 Main St, Springfield, IL 62701",
                    "type": "ADDRESS",
                    "start_index": 111,
                    "end_index": 145,
                },
            ],
        ],
    )
# Declares the PII extraction agent: registered with `workflowai.agent` under
# the id "pii-extractor" and bound to the LLAMA_4_SCOUT_BASIC model. The
# function takes a `PIIInput` and is annotated to return a `PIIOutput`; its
# body is only `...`, and `main()` invokes it through `extract_pii.run(...)`.
# NOTE(review): presumably the decorator supplies the implementation and uses
# the docstring below as the model's instructions — confirm against the
# workflowai SDK docs before rewording it.
@workflowai.agent(
    id="pii-extractor",
    model=Model.LLAMA_4_SCOUT_BASIC,
)
async def extract_pii(input_data: PIIInput) -> PIIOutput:
    """
    Extract and redact Personal Identifiable Information (PII) from text.
    Guidelines:
    1. Identify all instances of PII in the input text
    2. Categorize each PII instance into one of the defined types
    3. Record the exact position (start and end indices) of each PII instance
    4. Replace all PII in the text with [REDACTED]
    5. Ensure no sensitive information is left unredacted
    6. Be thorough but avoid over-redacting non-PII information
    7. When in doubt about PII type, use the OTHER category
    8. Maintain the original text structure and formatting
    9. Handle overlapping PII appropriately (e.g., name within an email)
    10. Consider context when identifying PII (e.g., distinguish between company and personal emails)
    """
    ...
async def main():
    """Run the two demo inputs through the PII agent and print the results."""
    # Example 1: Basic PII extraction
    await _run_example(
        "\nExample 1: Basic PII",
        "Hello, my name is Sarah Johnson and my email is sarah.j@example.com. "
        "You can reach me at (555) 123-4567 or visit my blog at blog.sarahj.net. "
        "I was born on 03/15/1985.",
    )

    # Example 2: Complex PII with financial and address information
    await _run_example(
        "\n\nExample 2: Complex PII",
        "Customer: David Wilson\n"
        "Card: 4532-9678-1234-5678\n"
        "Address: 789 Oak Avenue, Apt 4B\n"
        " Boston, MA 02108\n"
        "License: MA12-345-678\n"
        "SSN: 078-05-1120",
    )


async def _run_example(title: str, text: str) -> None:
    """Send ``text`` to the PII agent and print the original, redacted text and findings.

    Args:
        title: Heading line printed before the 50-dash separator.
        text: Raw input passed to the agent as ``PIIInput(text=text)``.
    """
    print(title)
    print("-" * 50)

    result = await extract_pii.run(PIIInput(text=text))
    output = result.output

    print("\nOriginal text:")
    print(text)
    print("\nRedacted text:")
    print(output.redacted_text)
    print("\nExtracted PII:")
    for pii in output.extracted_pii:
        # Use `.value` explicitly: formatting a `(str, Enum)` member inside an
        # f-string yields "NAME" on older Python versions but "PIIType.NAME"
        # on newer ones, so the bare member would print inconsistently.
        print(f"- {pii.type.value}: {pii.text} (positions {pii.start_index}-{pii.end_index})")
# Script entry point: run the async demo only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())