Skip to content

Commit

Permalink
Fix t-closeness Computation (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
zinwang authored Jul 27, 2023
1 parent 0317523 commit 3b1ac60
Show file tree
Hide file tree
Showing 9 changed files with 153 additions and 62 deletions.
2 changes: 1 addition & 1 deletion PETWorks/arx.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def loadDataHierarchyNatively(
path: PathLike, delimiter: str
) -> Dict[str, np.chararray]:
return {
attributeName: pd.read_csv(hierarchyFile, sep=delimiter).to_numpy(
attributeName: pd.read_csv(hierarchyFile, sep=delimiter, header=None).to_numpy(
dtype=str
)
for attributeName, hierarchyFile in __findHierarchyFile(path)
Expand Down
1 change: 0 additions & 1 deletion PETWorks/report/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def evaluate(
t = 1 - max(
[
measureTCloseness(
originalDataFrame,
anonymizedDataFrame,
sensitive,
qiNames,
Expand Down
107 changes: 50 additions & 57 deletions PETWorks/tcloseness.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,97 +70,92 @@ def _computeHierarchicalDistance(
return float(np.sum(costArray))


def _computeEqualDistance(
dataDistribution: dict[str, float],
groupDistribution: dict[str, float],
) -> float:
extraList = [
float(groupDistribution.get(value, 0) - dataDistribution.get(value, 0))
for value in dataDistribution.keys()
]

distance = 0.0
for extra in extraList:
distance += fabs(extra)
distance /= 2

return distance


def _computeNumericalDistance(
dataDistribution: dict[str, float],
groupDistribution: dict[str, float],
originalSensitiveData: pd.Series,
sensitiveData: pd.Series,
) -> float:
originalSensitiveData = originalSensitiveData.sort_values(
ascending=True, key=lambda x: pd.to_numeric(x, errors="coerce")
sensitiveData = sensitiveData.sort_values(
ascending=True, key=lambda x: pd.to_numeric(x)
)
numRows = len(originalSensitiveData)

valueList = sorted(
[originalSensitiveData[index] for index in range(numRows)],
key=lambda x: pd.to_numeric(x),
)
valueList = sensitiveData.unique().tolist()

extraList = [
float(groupDistribution.get(value, 0) - dataDistribution.get(value, 0))
for value in valueList
]

distance = 0.0
for index in range(numRows):
sum = 0
for subIndex in range(index):
sum += extraList[subIndex]
sum = 0.0
for extra in extraList:
sum += extra
distance += fabs(sum)
distance /= numRows - 1
distance /= len(extraList) - 1

return distance


def measureTCloseness(
    anonymizedData: pd.DataFrame,
    sensitiveAttributeName: str,
    qiNames: list[str],
    sensitiveHierarchy: np.chararray,
) -> float:
    """Return the t value of the anonymized data for one sensitive attribute.

    Groups the data into equivalence classes by the quasi-identifiers in
    ``qiNames`` and returns the maximum distance between any class's
    sensitive-value distribution and the overall distribution.

    Distance selection:
      * numerical attribute (first value parses as float) -> ordered EMD;
      * otherwise, with a hierarchy                       -> hierarchical EMD;
      * otherwise                                         -> equal ground distance.
    """
    dataDistribution = dict(
        anonymizedData[sensitiveAttributeName].value_counts()
        / len(anonymizedData)
    )
    anonymizedGroups = anonymizedData.groupby(qiNames)

    # The numeral probe inspects the whole column, not the group, so it
    # is loop-invariant — evaluate it once instead of per group.
    isNumeral = False
    try:
        float(anonymizedData[sensitiveAttributeName].iloc[0])
        isNumeral = True
    except ValueError:
        pass

    maxDistance = float("-inf")
    for _, group in anonymizedGroups:
        groupDistribution = dict(
            group[sensitiveAttributeName].value_counts() / len(group)
        )

        if isNumeral:
            distance = _computeNumericalDistance(
                dataDistribution,
                groupDistribution,
                anonymizedData[sensitiveAttributeName],
            )
        elif sensitiveHierarchy is not None:
            distance = _computeHierarchicalDistance(
                dataDistribution, groupDistribution, sensitiveHierarchy
            )
        else:
            distance = _computeEqualDistance(
                dataDistribution, groupDistribution
            )

        if distance > maxDistance:
            maxDistance = distance

    return maxDistance


def _validateTCloseness(tFromData: float, tLimit: float) -> bool:
Expand All @@ -173,7 +168,6 @@ def PETValidation(
tLimit = float(tLimit)

dataHierarchy = loadDataHierarchyNatively(dataHierarchy, ";")
originalData = pd.read_csv(original, sep=";", skipinitialspace=True)
anonymizedData = pd.read_csv(anonymized, sep=";", skipinitialspace=True)

qiNames = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER)
Expand All @@ -183,11 +177,10 @@ def PETValidation(

tList = [
measureTCloseness(
originalData,
anonymizedData,
sensitiveAttribute,
qiNames,
dataHierarchy,
dataHierarchy.get(sensitiveAttribute, None),
)
for sensitiveAttribute in sensitiveAttributes
]
Expand Down
10 changes: 10 additions & 0 deletions data/disease.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ZIPCode;Age;Salary;Disease
4767*;<=40;3;gastric ulcer
4767*;<=40;4;pneumonia
4767*;<=40;5;stomach cancer
4790*;>40;6;gastritis
4790*;>40;11;flu
4790*;>40;8;bronchitis
4760*;<=40;7;bronchitis
4760*;<=40;9;gastritis
4760*;<=40;10;stomach cancer
2 changes: 2 additions & 0 deletions data/disease_hierarchy/disease_hierarchy_Age.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<=40;*
>40;*
6 changes: 6 additions & 0 deletions data/disease_hierarchy/disease_hierarchy_Disease.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
gastric ulcer;stomach disease;digestive system disease;respiratory&digestive disease
gastritis;stomach disease;digestive system disease;respiratory&digestive disease
flu;respiratory infection;vascular lung disease;respiratory&digestive disease
bronchitis;respiratory infection;vascular lung disease;respiratory&digestive disease
pneumonia;respiratory infection;vascular lung disease;respiratory&digestive disease
stomach cancer;stomach disease;digestive system disease;respiratory&digestive disease
3 changes: 3 additions & 0 deletions data/disease_hierarchy/disease_hierarchy_ZIPCode.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
4767*;476**;47***;4****;*****
4760*;476**;47***;4****;*****
4790*;479**;47***;4****;*****
11 changes: 11 additions & 0 deletions data/merit.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Name;Merit
E;1
E;4
E;2
U;3
G;3
G;4
G;3
G;1
R;4
R;3
73 changes: 70 additions & 3 deletions tests/test_tcloseness.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,76 @@
from PETWorks.attributetypes import QUASI_IDENTIFIER
from PETWorks.attributetypes import SENSITIVE_ATTRIBUTE
from PETWorks.tcloseness import PETAnonymization
from PETWorks.attributetypes import (
QUASI_IDENTIFIER,
SENSITIVE_ATTRIBUTE,
INSENSITIVE_ATTRIBUTE,
)
from PETWorks.tcloseness import PETAnonymization, measureTCloseness
from PETWorks.arx import loadDataHierarchyNatively, getAttributeNameByType
import pandas as pd


def testMeasureTClosenessNumeral():
    """t for the merit dataset, where Merit is a numerical attribute."""
    attributeTypes = {"Name": QUASI_IDENTIFIER, "Merit": SENSITIVE_ATTRIBUTE}
    quasiIdentifiers = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER)
    sensitiveName = getAttributeNameByType(
        attributeTypes, SENSITIVE_ATTRIBUTE
    )[0]

    meritData = pd.read_csv(
        "data/merit.csv", sep=";", skipinitialspace=True
    )

    computed = measureTCloseness(
        meritData, sensitiveName, quasiIdentifiers, None
    )
    assert computed == 0.26666666666666666


def testMeasureTClosenessHierarchical():
    """t for the disease dataset using the Disease value hierarchy."""
    attributeTypes = {
        "ZIPCode": QUASI_IDENTIFIER,
        "Age": QUASI_IDENTIFIER,
        "Salary": INSENSITIVE_ATTRIBUTE,
        "Disease": SENSITIVE_ATTRIBUTE,
    }
    quasiIdentifiers = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER)
    sensitiveName = getAttributeNameByType(
        attributeTypes, SENSITIVE_ATTRIBUTE
    )[0]

    diseaseData = pd.read_csv(
        "data/disease.csv", sep=";", skipinitialspace=True
    )
    hierarchies = loadDataHierarchyNatively("data/disease_hierarchy", ";")

    computed = measureTCloseness(
        diseaseData,
        sensitiveName,
        quasiIdentifiers,
        hierarchies.get(sensitiveName, None),
    )
    assert computed == 0.29629629850387573


def testMeasureTClosenessEqual():
    """t for the disease dataset without a hierarchy (equal distance)."""
    attributeTypes = {
        "ZIPCode": QUASI_IDENTIFIER,
        "Age": QUASI_IDENTIFIER,
        "Salary": INSENSITIVE_ATTRIBUTE,
        "Disease": SENSITIVE_ATTRIBUTE,
    }
    quasiIdentifiers = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER)
    sensitiveName = getAttributeNameByType(
        attributeTypes, SENSITIVE_ATTRIBUTE
    )[0]

    diseaseData = pd.read_csv(
        "data/disease.csv", sep=";", skipinitialspace=True
    )

    computed = measureTCloseness(
        diseaseData, sensitiveName, quasiIdentifiers, None
    )
    assert computed == 0.5555555555555556


def testPETAnonymizationOrderedTCloseness(DATASET_PATH_ADULT):
attributeTypes = {
"age": SENSITIVE_ATTRIBUTE,
Expand Down

0 comments on commit 3b1ac60

Please sign in to comment.