Skip to content

Commit 2e2be78

Browse files
authored
Merge pull request #198 from edenai/EAISCRUM-1458-privateai-text-anonymization-pii
[Feat] PrivateAI text PII
2 parents e0dd662 + 11e18c8 commit 2e2be78

File tree

5 files changed

+204
-10
lines changed

5 files changed

+204
-10
lines changed

edenai_apis/apis/privateai/info.json

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,66 @@
3535
},
3636
"version": "v3"
3737
}
38+
},
39+
"text": {
40+
"anonymization": {
41+
"constraints": {
42+
"languages": [
43+
"en",
44+
"nl",
45+
"fr",
46+
"de",
47+
"hi",
48+
"it",
49+
"ja",
50+
"ko",
51+
"pt",
52+
"zh-Hans",
53+
"ru",
54+
"es",
55+
"tl",
56+
"uk",
57+
"af",
58+
"ar",
59+
"bm",
60+
"bn",
61+
"be",
62+
"bg",
63+
"my",
64+
"zh-Hant",
65+
"ca",
66+
"hr",
67+
"cs",
68+
"da",
69+
"et",
70+
"fi",
71+
"ka",
72+
"el",
73+
"he",
74+
"hu",
75+
"is",
76+
"id",
77+
"km",
78+
"lv",
79+
"lt",
80+
"lb",
81+
"ms",
82+
"ro",
83+
"nb",
84+
"fa",
85+
"pl",
86+
"pa",
87+
"sk",
88+
"sl",
89+
"sw",
90+
"sv",
91+
"ta",
92+
"th",
93+
"tr",
94+
"vi"
95+
]
96+
},
97+
"version": "v3"
98+
}
3899
}
39100
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
{
2+
"original_response": [
3+
{
4+
"processed_text": "The phone number of [NAME_GIVEN_1] is the [PHONE_NUMBER_1].",
5+
"entities": [
6+
{
7+
"processed_text": "NAME_GIVEN_1",
8+
"text": "Luc",
9+
"location": {
10+
"stt_idx": 20,
11+
"end_idx": 23,
12+
"stt_idx_processed": 20,
13+
"end_idx_processed": 34
14+
},
15+
"best_label": "NAME_GIVEN",
16+
"labels": {
17+
"NAME_GIVEN": 0.9117,
18+
"NAME": 0.9053
19+
}
20+
},
21+
{
22+
"processed_text": "PHONE_NUMBER_1",
23+
"text": "06 21 32 43 54",
24+
"location": {
25+
"stt_idx": 31,
26+
"end_idx": 45,
27+
"stt_idx_processed": 42,
28+
"end_idx_processed": 58
29+
},
30+
"best_label": "PHONE_NUMBER",
31+
"labels": {
32+
"PHONE_NUMBER": 0.9037
33+
}
34+
}
35+
],
36+
"entities_present": true,
37+
"characters_processed": 46,
38+
"languages_detected": {
39+
"en": 0.7309150695800781
40+
}
41+
}
42+
],
43+
"standardized_response": {
44+
"result": "The phone number of *** is the **************.",
45+
"entities": [
46+
{
47+
"offset": 20,
48+
"length": 3,
49+
"category": "PersonalInformation",
50+
"subcategory": null,
51+
"original_label": "NAME_GIVEN",
52+
"content": "Luc",
53+
"confidence_score": 0.912
54+
},
55+
{
56+
"offset": 31,
57+
"length": 14,
58+
"category": "PersonalInformation",
59+
"subcategory": null,
60+
"original_label": "PHONE_NUMBER",
61+
"content": "06 21 32 43 54",
62+
"confidence_score": 0.904
63+
}
64+
]
65+
}
66+
}

edenai_apis/apis/privateai/privateai_api.py

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,20 @@
22
import json
33
import mimetypes
44
import uuid
5-
from typing import Dict
5+
from typing import Dict, List
66
from io import BytesIO
77

88
import requests
99

10-
from edenai_apis.features import ProviderInterface, OcrInterface
10+
from edenai_apis.features import ProviderInterface, OcrInterface, TextInterface
1111
from edenai_apis.features.ocr.anonymization_async.anonymization_async_dataclass import (
1212
AnonymizationAsyncDataClass,
1313
)
14+
from edenai_apis.features.text.anonymization.anonymization_dataclass import (
15+
AnonymizationDataClass,
16+
AnonymizationEntity,
17+
)
18+
from edenai_apis.features.text.anonymization.category import CategoryType
1419
from edenai_apis.loaders.data_loader import ProviderDataEnum
1520
from edenai_apis.loaders.loaders import load_provider
1621
from apis.amazon.helpers import check_webhook_result
@@ -20,11 +25,13 @@
2025
AsyncLaunchJobResponseType,
2126
AsyncPendingResponseType,
2227
AsyncResponseType,
28+
ResponseType,
2329
)
2430
from edenai_apis.utils.upload_s3 import upload_file_bytes_to_s3, USER_PROCESS
31+
from edenai_apis.utils.parsing import extract
2532

2633

27-
class PrivateaiApi(ProviderInterface, OcrInterface):
34+
class PrivateaiApi(ProviderInterface, OcrInterface, TextInterface):
2835
provider_name = "privateai"
2936

3037
def __init__(self, api_keys: Dict = {}) -> None:
@@ -131,3 +138,52 @@ def ocr__anonymization_async__get_job_result(
131138
),
132139
provider_job_id=provider_job_id,
133140
)
141+
142+
def text__anonymization(
    self, text: str, language: str
) -> ResponseType[AnonymizationDataClass]:
    """Detect PII in ``text`` with PrivateAI and mask it with asterisks.

    The text is posted to the ``v3/process/text`` endpoint. Every entity
    returned by the provider is replaced, character for character, by
    ``*`` in a copy of the input, so the offsets reported by the provider
    stay valid for the masked result.

    Args:
        text: The text to anonymize.
        language: Part of the feature interface; it is not sent to the
            provider by this method.

    Returns:
        A ``ResponseType`` holding the raw provider payload and the
        standardized ``AnonymizationDataClass`` (masked text + entities).

    Raises:
        ProviderException: If the provider answers with a non-200 status,
            a body that is not valid JSON, or a payload whose shape or
            entity locations cannot be used.
    """
    payload = {
        "text": [text],
        "entity_detection": {"accuracy": "high", "return_entity": True},
    }
    response = requests.post(
        url=self.url + "v3/process/text", json=payload, headers=self.headers
    )

    if response.status_code != 200:
        raise ProviderException(message=response.text, code=response.status_code)

    try:
        original_response = response.json()
    except ValueError as exc:
        # ValueError is the common base of json.JSONDecodeError and of the
        # simplejson-based error requests raises when simplejson is installed.
        raise ProviderException(message="Internal server error", code=500) from exc

    # A single text was submitted, so the result is read from the first item.
    if (
        not isinstance(original_response, list)
        or not original_response
        or not isinstance(original_response[0], dict)
    ):
        raise ProviderException(
            message="Unexpected response format from provider", code=500
        )

    entities: List[AnonymizationEntity] = []
    new_text = text
    # `or []` also covers an explicit `"entities": null` in the payload.
    for entity in original_response[0].get("entities") or []:
        start_index = extract(entity, ["location", "stt_idx"])
        end_index = extract(entity, ["location", "end_idx"])
        # Refuse unusable locations instead of skipping them: silently
        # dropping an entity would leave its PII unmasked in the result.
        if (
            not isinstance(start_index, int)
            or not isinstance(end_index, int)
            or not 0 <= start_index <= end_index
        ):
            raise ProviderException(
                message="Invalid entity location in provider response", code=500
            )

        length = end_index - start_index
        original_label = entity.get("best_label")
        confidence = extract(entity, ["labels", original_label])
        classification = CategoryType.choose_category_subcategory(original_label)

        # Same-length replacement keeps later entity offsets aligned.
        new_text = new_text[:start_index] + "*" * length + new_text[end_index:]
        entities.append(
            AnonymizationEntity(
                offset=start_index,
                length=length,
                category=classification["category"],
                subcategory=classification["subcategory"],
                original_label=original_label,
                content=entity.get("text"),
                confidence_score=confidence,
            )
        )

    standardized_response = AnonymizationDataClass(
        result=new_text, entities=entities
    )
    return ResponseType[AnonymizationDataClass](
        original_response=original_response,
        standardized_response=standardized_response,
    )

edenai_apis/features/text/anonymization/pattern.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ class PersonnalInformation:
2323
PERSON_TYPE (list): List of all the patterns for the person type subcategory.
2424
"""
2525

26-
NAME = ["name", "username", "person", "human"]
26+
NAME = ["name", "username", "person", "human", "name_given", "name_family"]
2727
AGE = ["age"]
28-
EMAIL = ["email"]
29-
PHONE = ["phone", "phonenumber", "phone number"]
28+
EMAIL = ["email", "email_address"]
29+
PHONE = ["phone", "phonenumber", "phone number", "phone_number"]
3030
PERSON_TYPE = ["persontype"]
31+
GENDER = ["gender_sexuality"]
3132

3233
class FinancialInformation:
3334
"""This class contains all the patterns for the subcategories of the anonymization category.
@@ -52,6 +53,7 @@ class FinancialInformation:
5253
"bank account number",
5354
"bank_account_number",
5455
"international_bank_account_number",
56+
"account_number",
5557
]
5658
BANK_ROUTING_NUMBER = ["bank_routing", "aba_routing_number", "iban"]
5759
SWIFT_CODE = ["swift code", "swift_code"]
@@ -172,10 +174,10 @@ class DateAndTime:
172174
DATE (list): List of all the patterns for the date subcategory.
173175
"""
174176

175-
DURATION = ["duration", "timeduration"]
177+
DURATION = ["duration", "timeduration", "date_interval"]
176178
DATE_TIME = ["date_time", "datetime"]
177179
TIME = ["time"]
178-
DATE = ["date"]
180+
DATE = ["date", "dob"]
179181

180182
class LocationInformation:
181183
"""This class contains all the patterns for the subcategories of the anonymization category.
@@ -185,8 +187,15 @@ class LocationInformation:
185187
ADDRESS (list): List of all the patterns for the address subcategory.
186188
"""
187189

188-
ADDRESS = ["address"]
189-
LOCATION = ["location"]
190+
ADDRESS = ["address", "LOCATION_ADDRESS"]
191+
LOCATION = [
192+
"location",
193+
"LOCATION_CITY",
194+
"LOCATION_COORDINATE",
195+
"LOCATION_COUNTRY",
196+
"LOCATION_STATE",
197+
"LOCATION_ZIP",
198+
]
190199

191200
class Other:
192201
"""This class contains all the patterns for the subcategories of the anonymization category.

edenai_apis/features/text/anonymization/subcategory.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class PersonalInformationSubCategoryType(SubCategoryBase, Enum):
6969
Email = "Email"
7070
Phone = "Phone"
7171
PersonType = "PersonType"
72+
Gender = "Gender"
7273

7374
@classmethod
7475
def list_choices(cls) -> Dict["SubCategoryBase", List[str]]:
@@ -78,6 +79,7 @@ def list_choices(cls) -> Dict["SubCategoryBase", List[str]]:
7879
cls.Email: SubCategoryPattern.PersonnalInformation.EMAIL,
7980
cls.Phone: SubCategoryPattern.PersonnalInformation.PHONE,
8081
cls.PersonType: SubCategoryPattern.PersonnalInformation.PERSON_TYPE,
82+
cls.Gender: SubCategoryPattern.PersonnalInformation.GENDER,
8183
}
8284

8385

0 commit comments

Comments
 (0)