Skip to content

Commit

Permalink
Create error estimator for SUM (#528)
Browse files Browse the repository at this point in the history
  • Loading branch information
dvadym authored Oct 17, 2024
1 parent 4cf6a56 commit 3a7a0ff
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 27 deletions.
94 changes: 77 additions & 17 deletions pipeline_dp/dataset_histograms/histogram_error_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,25 @@
import bisect


class CountErrorEstimator:
class ErrorEstimator:
"""Estimator of the error from DP pipeline from DatasetHistograms.
The recommended way to create this object is to use
create_estimator_for_count_and_privacy_id_count or create_estimator_for_sum.
It works only for COUNT, PRIVACY_ID_COUNT and SUM.
Partition selection error is not implemented yet. Now only contribution
bounding and noise error are taken into consideration.
"""

def __init__(self, epsilon: float, delta: Optional[float],
metric: pipeline_dp.Metric, noise: pipeline_dp.NoiseKind,
l0_ratios_dropped: Sequence[Tuple[int, float]],
linf_ratios_dropped: Sequence[Tuple[int, float]],
partition_histogram: hist.Histogram):
def __init__(
self,
epsilon: float,
delta: Optional[float],
metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind,
l0_ratios_dropped: Sequence[Tuple[int, float]],
linf_ratios_dropped: Sequence[Tuple[int, float]],
partition_histogram: hist.Histogram,
):
self._base_std = self._get_stddev_for_dp_mechanism(
epsilon, delta, noise)
self._metric = metric
Expand Down Expand Up @@ -84,15 +88,16 @@ def estimate_rmse(self,
linf_bound: linf contribution bound, known for COUNT as
max_contributions_per_partition. This parameter is ignored for
PRIVACY_ID_COUNT.
Returns:
the estimated error.
"""
if self._metric == pipeline_dp.Metrics.COUNT:
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
if linf_bound is None:
raise ValueError("linf must be given for COUNT")
ratio_dropped_l0 = self.get_ratio_dropped_l0(l0_bound)
ratio_dropped_linf = 0
if self._metric == pipeline_dp.Metrics.COUNT:
if self._metric != pipeline_dp.Metrics.PRIVACY_ID_COUNT:
ratio_dropped_linf = self.get_ratio_dropped_linf(linf_bound)
ratio_dropped = 1 - (1 - ratio_dropped_l0) * (1 - ratio_dropped_linf)
stddev = self._get_stddev(l0_bound, linf_bound)
Expand Down Expand Up @@ -133,23 +138,29 @@ def _get_stddev(self,
return self._base_std * math.sqrt(l0_bound) * linf_bound


def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
delta: Optional[float], metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind) -> CountErrorEstimator:
def create_estimator_for_count_and_privacy_id_count(
histograms: hist.DatasetHistograms,
epsilon: float,
delta: Optional[float],
metric: pipeline_dp.Metric,
noise: pipeline_dp.NoiseKind,
) -> ErrorEstimator:
"""Creates histogram based error estimator for COUNT or PRIVACY_ID_COUNT.
Args:
histograms: dataset histograms.
epsilon: epsilon parameter of the DP mechanism for adding noise.
delta: delta parameter of the DP mechanism for adding noise (must be
None for Laplace noise).
delta: delta parameter of the DP mechanism for adding noise (must be None
for Laplace noise).
metric: DP aggregation, COUNT or PRIVACY_ID_COUNT.
noise: type of DP noise.
Returns:
Error estimator.
"""
if metric not in [
pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
pipeline_dp.Metrics.COUNT,
pipeline_dp.Metrics.PRIVACY_ID_COUNT,
]:
raise ValueError(
f"Only COUNT and PRIVACY_ID_COUNT are supported, but metric={metric}"
Expand All @@ -162,8 +173,15 @@ def create_error_estimator(histograms: hist.DatasetHistograms, epsilon: float,
partition_histogram = histograms.count_per_partition_histogram
else:
partition_histogram = histograms.count_privacy_id_per_partition
return CountErrorEstimator(epsilon, delta, metric, noise, l0_ratios_dropped,
linf_ratios_dropped, partition_histogram)
return ErrorEstimator(
epsilon,
delta,
metric,
noise,
l0_ratios_dropped,
linf_ratios_dropped,
partition_histogram,
)


def _estimate_rmse_impl(ratio_dropped: float, std: float,
Expand All @@ -176,3 +194,45 @@ def _estimate_rmse_impl(ratio_dropped: float, std: float,
std**2)
sum_rmse += bin.count * rmse
return sum_rmse / num_partitions


def create_estimator_for_sum(histograms: hist.DatasetHistograms,
                             epsilon: float,
                             delta: Optional[float],
                             noise: pipeline_dp.NoiseKind,
                             sum_index: int = 0) -> ErrorEstimator:
    """Builds a histogram based error estimator for the SUM metric.

    Args:
      histograms: dataset histograms.
      epsilon: epsilon parameter of the DP mechanism for adding noise.
      delta: delta parameter of the DP mechanism for adding noise (must be
        None for Laplace noise).
      noise: type of DP noise.
      sum_index: index of the sum to use when the histograms describe
        several SUM aggregations. It is not consulted when
        histograms.linf_sum_contributions_histogram is a single Histogram.

    Returns:
      An ErrorEstimator constructed with metric pipeline_dp.Metrics.SUM.
    """
    l0_dropped = hist.compute_ratio_dropped(
        histograms.l0_contributions_histogram)

    # Start from the single-SUM layout, where both attributes are already
    # the histograms to use.
    linf_histogram = histograms.linf_sum_contributions_histogram
    per_partition_histogram = histograms.sum_per_partition_histogram
    if not isinstance(linf_histogram, hist.Histogram):
        # Multiple SUM aggregations: both attributes are indexable by the
        # position of the sum, so pick the requested one.
        linf_histogram = linf_histogram[sum_index]
        per_partition_histogram = per_partition_histogram[sum_index]

    linf_dropped = hist.compute_ratio_dropped(linf_histogram)

    return ErrorEstimator(
        epsilon=epsilon,
        delta=delta,
        metric=pipeline_dp.Metrics.SUM,
        noise=noise,
        l0_ratios_dropped=l0_dropped,
        linf_ratios_dropped=linf_dropped,
        partition_histogram=per_partition_histogram,
    )
55 changes: 45 additions & 10 deletions tests/dataset_histograms/histogram_error_estimator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,25 @@ def _get_histograms(self) -> hist.DatasetHistograms:
computing_histograms.compute_dataset_histograms(
dataset, data_extractors, pipeline_dp.LocalBackend()))[0]

def _get_estimator(
def _get_estimator_for_count_and_privacy_id_count(
self,
metric: pipeline_dp.Metric,
noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
epsilon: float = 2**0.5 / 2,
delta: Optional[float] = None,
):
return histogram_error_estimator.create_error_estimator(
return histogram_error_estimator.create_estimator_for_count_and_privacy_id_count(
self._get_histograms(), epsilon, delta, metric, noise_kind)

def _get_estimator_for_sum(
    self,
    noise_kind: pipeline_dp.NoiseKind = pipeline_dp.NoiseKind.LAPLACE,
    epsilon: float = 2**0.5 / 2,
    delta: Optional[float] = None,
):
    """Returns a SUM error estimator built from the test histograms.

    Args:
      noise_kind: type of DP noise, Laplace by default.
      epsilon: epsilon passed to the estimator factory.
      delta: delta passed to the estimator factory (None by default).
    """
    dataset_histograms = self._get_histograms()
    return histogram_error_estimator.create_estimator_for_sum(
        dataset_histograms, epsilon, delta, noise_kind)

@parameterized.named_parameters(
dict(testcase_name='count_gaussian',
metric=pipeline_dp.Metrics.COUNT,
Expand Down Expand Up @@ -90,25 +99,34 @@ def test_count_get_sigma(self, metric: pipeline_dp.Metric, epsilon: float,
delta: Optional[float],
noise_kind: pipeline_dp.NoiseKind, l0: float,
linf: float, expected: float):
estimator = self._get_estimator(metric=metric,
epsilon=epsilon,
delta=delta,
noise_kind=noise_kind)
estimator = self._get_estimator_for_count_and_privacy_id_count(
metric=metric, epsilon=epsilon, delta=delta, noise_kind=noise_kind)
self.assertAlmostEqual(estimator._get_stddev(l0, linf),
expected,
delta=1e-10)

def test_sum_not_supported(self):
with self.assertRaisesRegex(
ValueError, "Only COUNT and PRIVACY_ID_COUNT are supported"):
self._get_estimator(pipeline_dp.Metrics.SUM)
self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.SUM)

@parameterized.parameters((0, 1), (1, 9 / 11), (2, 8 / 11), (3, 7 / 11),
(9, 1 / 11), (10, 0), (20, 0))
# there are 11 (privacy_id, partition) pairs (from 2 privacy units), when
# l0_bound=1, 9 are dropped (from 1 privacy unit).
def test_get_ratio_dropped_l0(self, l0_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.get_ratio_dropped_l0(l0_bound),
expected)

@parameterized.parameters(
    (0, 1),
    (1, 9 / 11),
    (2, 8 / 11),
    (3, 7 / 11),
    (9, 1 / 11),
    (10, 0),
    (20, 0),
)
# The test dataset has 11 (privacy_id, partition) pairs coming from 2 privacy
# units; with l0_bound=1, 9 of them are dropped (all from 1 privacy unit).
def test_get_ratio_dropped_l0_for_sum(self, l0_bound, expected):
    sum_estimator = self._get_estimator_for_sum()
    actual_ratio = sum_estimator.get_ratio_dropped_l0(l0_bound)
    self.assertAlmostEqual(actual_ratio, expected)

Expand All @@ -117,7 +135,19 @@ def test_get_ratio_dropped_l0(self, l0_bound, expected):
# there are 30 rows (from 2 privacy units), when linf_bound=1, 19 are
# dropped (from 1 privacy unit, which contributes 20 to 1 partition).
def test_get_ratio_dropped_linf(self, linf_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.get_ratio_dropped_linf(linf_bound),
expected)

@parameterized.parameters(
    (0, 1),
    (0.5, 0.89),
    (1, 0.78),
    (2, 0.76),
    (40, 0),
)
# There is 1 contribution of 40 and 10 contributions of 1, so the total
# contribution is 1*40 + 10*1 = 50.
# With linf_bound=0.5, what is left after contribution bounding is
# 11*0.5 = 5.5, i.e. the dropped ratio is (50 - 5.5)/50 = 0.89.
def test_get_ratio_dropped_linf_for_sum(self, linf_bound, expected):
    sum_estimator = self._get_estimator_for_sum()
    actual_ratio = sum_estimator.get_ratio_dropped_linf(linf_bound)
    self.assertAlmostEqual(actual_ratio, expected)

Expand All @@ -138,10 +168,15 @@ def test_get_ratio_dropped_linf(self, linf_bound, expected):
# rmse2 = sqrt(21*total_ratio_dropped + noise_stddev**2) ~= 19.70177
# rmse = (9*rmse1+rmse2)/10.
def test_estimate_rmse_count(self, l0_bound, linf_bound, expected):
estimator = self._get_estimator(pipeline_dp.Metrics.COUNT)
estimator = self._get_estimator_for_count_and_privacy_id_count(
pipeline_dp.Metrics.COUNT)
self.assertAlmostEqual(estimator.estimate_rmse(l0_bound, linf_bound),
expected)

def test_estimate_rmse_sum(self):
    # Checks the estimated RMSE of the default SUM estimator for
    # l0_bound=1 and linf_bound=1 against a precomputed value.
    rmse = self._get_estimator_for_sum().estimate_rmse(1, 1)
    self.assertAlmostEqual(rmse, 5.93769917)


if __name__ == '__main__':
absltest.main()

0 comments on commit 3a7a0ff

Please sign in to comment.