From 3b1ac60b22aeb3a835894efd72aea3144f469e8d Mon Sep 17 00:00:00 2001 From: zinwang <32264884+zinwang@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:25:18 +0800 Subject: [PATCH] Fix t-closeness Computation (#30) --- PETWorks/arx.py | 2 +- PETWorks/report/evaluator.py | 1 - PETWorks/tcloseness.py | 107 ++++++++---------- data/disease.csv | 10 ++ .../disease_hierarchy_Age.csv | 2 + .../disease_hierarchy_Disease.csv | 6 + .../disease_hierarchy_ZIPCode.csv | 3 + data/merit.csv | 11 ++ tests/test_tcloseness.py | 73 +++++++++++- 9 files changed, 153 insertions(+), 62 deletions(-) create mode 100644 data/disease.csv create mode 100644 data/disease_hierarchy/disease_hierarchy_Age.csv create mode 100644 data/disease_hierarchy/disease_hierarchy_Disease.csv create mode 100644 data/disease_hierarchy/disease_hierarchy_ZIPCode.csv create mode 100644 data/merit.csv diff --git a/PETWorks/arx.py b/PETWorks/arx.py index 9809bec..a56aea1 100644 --- a/PETWorks/arx.py +++ b/PETWorks/arx.py @@ -141,7 +141,7 @@ def loadDataHierarchyNatively( path: PathLike, delimiter: str ) -> Dict[str, np.chararray]: return { - attributeName: pd.read_csv(hierarchyFile, sep=delimiter).to_numpy( + attributeName: pd.read_csv(hierarchyFile, sep=delimiter, header=None).to_numpy( dtype=str ) for attributeName, hierarchyFile in __findHierarchyFile(path) diff --git a/PETWorks/report/evaluator.py b/PETWorks/report/evaluator.py index 40971b6..e439047 100644 --- a/PETWorks/report/evaluator.py +++ b/PETWorks/report/evaluator.py @@ -113,7 +113,6 @@ def evaluate( t = 1 - max( [ measureTCloseness( - originalDataFrame, anonymizedDataFrame, sensitive, qiNames, diff --git a/PETWorks/tcloseness.py b/PETWorks/tcloseness.py index 8f378e0..73fefb0 100644 --- a/PETWorks/tcloseness.py +++ b/PETWorks/tcloseness.py @@ -70,20 +70,33 @@ def _computeHierarchicalDistance( return float(np.sum(costArray)) +def _computeEqualDistance( + dataDistribution: dict[str, float], + groupDistribution: dict[str, float], +) -> float: + extraList = [ + float(groupDistribution.get(value, 0) - dataDistribution.get(value, 0)) + for value in dataDistribution.keys() + ] + + distance = 0.0 + for extra in extraList: + distance += fabs(extra) + distance /= 2 + + return distance + + def _computeNumericalDistance( dataDistribution: dict[str, float], groupDistribution: dict[str, float], - originalSensitiveData: pd.Series, + sensitiveData: pd.Series, ) -> float: - originalSensitiveData = originalSensitiveData.sort_values( - ascending=True, key=lambda x: pd.to_numeric(x, errors="coerce") + sensitiveData = sensitiveData.sort_values( + ascending=True, key=lambda x: pd.to_numeric(x) ) - numRows = len(originalSensitiveData) - valueList = sorted( - [originalSensitiveData[index] for index in range(numRows)], - key=lambda x: pd.to_numeric(x), - ) + valueList = sensitiveData.unique().tolist() extraList = [ float(groupDistribution.get(value, 0) - dataDistribution.get(value, 0)) @@ -91,76 +104,58 @@ def _computeNumericalDistance( ] distance = 0.0 - for index in range(numRows): - sum = 0 - for subIndex in range(index): - sum += extraList[subIndex] + sum = 0.0 + for extra in extraList: + sum += extra distance += fabs(sum) - distance /= numRows - 1 + distance /= len(extraList) - 1 return distance -def _computeTCloseness( - originalData: pd.DataFrame, +def measureTCloseness( anonymizedData: pd.DataFrame, sensitiveAttributeName: str, qiNames: list[str], sensitiveHierarchy: np.chararray, ) -> float: dataDistribution = dict( - originalData[sensitiveAttributeName].value_counts() * 0 - + 1 / originalData[sensitiveAttributeName].nunique() + anonymizedData[sensitiveAttributeName].value_counts() + / len(anonymizedData) ) anonymizedGroups = anonymizedData.groupby(qiNames) - maxHierarchicalDistance = float("-inf") + maxDistance = float("-inf") for _, group in anonymizedGroups: groupDistribution = dict( - group[sensitiveAttributeName].value_counts() * 0 + 1 / len(group) + group[sensitiveAttributeName].value_counts() / len(group) ) - if sensitiveHierarchy is not None: + isNumeral = False + try: + float(anonymizedData[sensitiveAttributeName].iloc[0]) + isNumeral = True + except ValueError: + pass + + if isNumeral: + distance = _computeNumericalDistance( + dataDistribution, + groupDistribution, + anonymizedData[sensitiveAttributeName], + ) + elif sensitiveHierarchy is not None: distance = _computeHierarchicalDistance( dataDistribution, groupDistribution, sensitiveHierarchy ) else: - distance = _computeNumericalDistance( - dataDistribution, - groupDistribution, - originalData[sensitiveAttributeName], + distance = _computeEqualDistance( + dataDistribution, groupDistribution ) - if distance > maxHierarchicalDistance: - maxHierarchicalDistance = distance - - return maxHierarchicalDistance - - -def measureTCloseness( - originalData: pd.DataFrame, - anonymizedData: pd.DataFrame, - sensitiveAttributeName: str, - qiNames: list[str], - sensitiveHierarchy: np.chararray, -) -> float: - isNumerical = True - try: - float(sensitiveHierarchy[0, 0]) - except ValueError: - isNumerical = False - - if isNumerical: - return _computeTCloseness( - originalData, anonymizedData, sensitiveAttributeName, qiNames, None - ) + if distance > maxDistance: + maxDistance = distance - return _computeTCloseness( - originalData, - anonymizedData, - sensitiveAttributeName, - qiNames, - sensitiveHierarchy, - ) + return maxDistance def _validateTCloseness(tFromData: float, tLimit: float) -> bool: @@ -173,7 +168,6 @@ def PETValidation( tLimit = float(tLimit) dataHierarchy = loadDataHierarchyNatively(dataHierarchy, ";") - originalData = pd.read_csv(original, sep=";", skipinitialspace=True) anonymizedData = pd.read_csv(anonymized, sep=";", skipinitialspace=True) qiNames = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER) @@ -183,11 +177,10 @@ def PETValidation( tList = [ measureTCloseness( - originalData, anonymizedData, sensitiveAttribute, qiNames, - dataHierarchy, + dataHierarchy.get(sensitiveAttribute, None), ) for sensitiveAttribute in sensitiveAttributes ] diff --git a/data/disease.csv b/data/disease.csv new file mode 100644 index 0000000..5c332c1 --- /dev/null +++ b/data/disease.csv @@ -0,0 +1,10 @@ +ZIPCode;Age;Salary;Disease +4767*;<=40;3;gastric ulcer +4767*;<=40;4;pneumonia +4767*;<=40;5;stomach cancer +4790*;>40;6;gastritis +4790*;>40;11;flu +4790*;>40;8;bronchitis +4760*;<=40;7;bronchitis +4760*;<=40;9;gastritis +4760*;<=40;10;stomach cancer \ No newline at end of file diff --git a/data/disease_hierarchy/disease_hierarchy_Age.csv b/data/disease_hierarchy/disease_hierarchy_Age.csv new file mode 100644 index 0000000..115072b --- /dev/null +++ b/data/disease_hierarchy/disease_hierarchy_Age.csv @@ -0,0 +1,2 @@ +<=40;* +>40;* \ No newline at end of file diff --git a/data/disease_hierarchy/disease_hierarchy_Disease.csv b/data/disease_hierarchy/disease_hierarchy_Disease.csv new file mode 100644 index 0000000..e0dcfa9 --- /dev/null +++ b/data/disease_hierarchy/disease_hierarchy_Disease.csv @@ -0,0 +1,6 @@ +gastric ulcer;stomach disease;digestive system disease;respiratory&digestive disease +gastritis;stomach disease;digestive system disease;respiratory&digestive disease +flu;respiratory infection;vascular lung disease;respiratory&digestive disease +bronchitis;respiratory infection;vascular lung disease;respiratory&digestive disease +pneumonia;respiratory infection;vascular lung disease;respiratory&digestive disease +stomach cancer;stomach disease;digestive system disease;respiratory&digestive disease \ No newline at end of file diff --git a/data/disease_hierarchy/disease_hierarchy_ZIPCode.csv b/data/disease_hierarchy/disease_hierarchy_ZIPCode.csv new file mode 100644 index 0000000..c568f83 --- /dev/null +++ b/data/disease_hierarchy/disease_hierarchy_ZIPCode.csv @@ -0,0 +1,3 @@ +4767*;476**;47***;4****;***** +4760*;476**;47***;4****;***** +4790*;479**;47***;4****;***** \ No newline at end of file diff --git a/data/merit.csv b/data/merit.csv new file mode 100644 index 0000000..9527298 --- /dev/null +++ b/data/merit.csv @@ -0,0 +1,11 @@ +Name;Merit +E;1 +E;4 +E;2 +U;3 +G;3 +G;4 +G;3 +G;1 +R;4 +R;3 \ No newline at end of file diff --git a/tests/test_tcloseness.py b/tests/test_tcloseness.py index 757c9ab..6c33f4e 100644 --- a/tests/test_tcloseness.py +++ b/tests/test_tcloseness.py @@ -1,9 +1,76 @@ -from PETWorks.attributetypes import QUASI_IDENTIFIER -from PETWorks.attributetypes import SENSITIVE_ATTRIBUTE -from PETWorks.tcloseness import PETAnonymization +from PETWorks.attributetypes import ( + QUASI_IDENTIFIER, + SENSITIVE_ATTRIBUTE, + INSENSITIVE_ATTRIBUTE, +) +from PETWorks.tcloseness import PETAnonymization, measureTCloseness +from PETWorks.arx import loadDataHierarchyNatively, getAttributeNameByType import pandas as pd +def testMeasureTClosenessNumeral(): + anonymizedData = pd.read_csv( + "data/merit.csv", sep=";", skipinitialspace=True + ) + attributeTypes = {"Name": QUASI_IDENTIFIER, "Merit": SENSITIVE_ATTRIBUTE} + qiNames = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER) + sensitiveAttribute = getAttributeNameByType( + attributeTypes, SENSITIVE_ATTRIBUTE + )[0] + assert ( + measureTCloseness(anonymizedData, sensitiveAttribute, qiNames, None) + == 0.26666666666666666 + ) + + +def testMeasureTClosenessHierarchical(): + anonymizedData = pd.read_csv( + "data/disease.csv", sep=";", skipinitialspace=True + ) + dataHierarchy = loadDataHierarchyNatively("data/disease_hierarchy", ";") + attributeTypes = { + "ZIPCode": QUASI_IDENTIFIER, + "Age": QUASI_IDENTIFIER, + "Salary": INSENSITIVE_ATTRIBUTE, + "Disease": SENSITIVE_ATTRIBUTE, + } + qiNames = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER) + sensitiveAttribute = getAttributeNameByType( + attributeTypes, SENSITIVE_ATTRIBUTE + )[0] + + assert ( + measureTCloseness( + anonymizedData, + sensitiveAttribute, + qiNames, + dataHierarchy.get(sensitiveAttribute, None), + ) + == 0.29629629850387573 + ) + + +def testMeasureTClosenessEqual(): + anonymizedData = pd.read_csv( + "data/disease.csv", sep=";", skipinitialspace=True + ) + attributeTypes = { + "ZIPCode": QUASI_IDENTIFIER, + "Age": QUASI_IDENTIFIER, + "Salary": INSENSITIVE_ATTRIBUTE, + "Disease": SENSITIVE_ATTRIBUTE, + } + qiNames = getAttributeNameByType(attributeTypes, QUASI_IDENTIFIER) + sensitiveAttribute = getAttributeNameByType( + attributeTypes, SENSITIVE_ATTRIBUTE + )[0] + + assert ( + measureTCloseness(anonymizedData, sensitiveAttribute, qiNames, None) + == 0.5555555555555556 + ) + + def testPETAnonymizationOrderedTCloseness(DATASET_PATH_ADULT): attributeTypes = { "age": SENSITIVE_ATTRIBUTE,