Questions or feedback?

Privately Selecting Grouping Columns

This longer example demonstrates how plugins could be used on a real-world problem.

Imagine you want to group the rows in a private dataset before releasing aggregate statistics, but you yourself are not allowed to look at the private data, and you don’t know what columns to group by. You do know that there are three types of columns:

  • Columns that are too uniform: most rows have the same value.

  • Columns that are too diverse: most rows have unique values.

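  • Columns that are just right: values repeat often enough to form well-populated groups, but not so often that most rows share the same value.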

If you group by too many columns, then the number of individuals contributing to each combination of attributes will be small, resulting in most combinations being censored in the final release. On the other hand, you want granular statistics, so grouping by more columns is appealing.

The example below demonstrates how to construct your own mechanism that chooses a set of grouping columns for you. It also makes use of library plugins (via a user-defined transformation and domain) and the Report Noisy Max Gumbel mechanism.

We’ll first write plugins for our transformation and measurement:

>>> import opendp.prelude as dp
>>> import pandas as pd
>>> import random
>>> from itertools import chain, combinations
>>> def make_grouping_cols_score(
...     candidates, min_bin_contributions
... ):
...     r"""Create a transformation that assesses the utility of each candidate in `candidates`."""
...     dp.assert_features("contrib")
...     def score(x: pd.DataFrame, c):
...         return (
...             (
...                 x.groupby(list(c)).size()
...                 >= min_bin_contributions
...             )
...             .sum()
...             .astype(float)
...         )
...     return dp.t.make_user_transformation(
...         input_domain=dp.user_domain(
...             "PandasDomain",
...             member=lambda x: isinstance(x, pd.DataFrame),
...         ),
...         input_metric=dp.symmetric_distance(),
...         output_domain=dp.vector_domain(
...             dp.atom_domain(T=float, nan=False)
...         ),
...         output_metric=dp.linf_distance(
...             T=float, monotonic=True
...         ),
...         function=lambda x: [
...             score(x, c) for c in candidates
...         ],
...         stability_map=lambda d_in: float(d_in),
...     )
...
>>> def make_select_grouping_cols(
...     candidates, min_bin_size, scale
... ):
...     """Create a measurement that selects a set of grouping columns from `candidates`."""
...     return (
...         make_grouping_cols_score(candidates, min_bin_size)
...         >> dp.m.then_noisy_max(dp.max_divergence(), scale)
...         >> (lambda idx: candidates[idx])
...     )
...

#' Build a transformation that scores each candidate set of grouping columns.
#'
#' The score of a candidate is the number of groups, under that candidate's
#' columns, that hold at least `min_bin_contributions` rows.
#'
#' @param input_domain Domain of the input data frame.
#' @param input_metric Metric on the input data frame.
#' @param candidates List of character vectors of column names.
#' @param min_bin_contributions Smallest group size that counts toward a score.
#' @return A transformation from a data frame to one score per candidate.
make_grouping_cols_score <- function(
  input_domain,
  input_metric,
  candidates,
  min_bin_contributions
) {
  # Count the groups formed by `cols` that are large enough.
  count_large_bins <- function(data, cols) {
    if (length(cols) == 1L) {
      groups <- data[[cols]]
    } else {
      # One factor level per observed combination of values.
      groups <- interaction(
        data[, cols, drop = FALSE],
        drop = TRUE,
        lex.order = TRUE
      )
    }
    bin_sizes <- table(groups)
    sum(bin_sizes >= min_bin_contributions)
  }

  score_candidates <- function(data) {
    vapply(
      candidates,
      function(cols) count_large_bins(data, cols),
      numeric(1L)
    )
  }

  # Adding or removing one row changes one group size by one, so each score
  # moves by at most one per row.
  stability <- function(d_in) as.numeric(d_in)

  make_user_transformation(
    input_domain = input_domain,
    input_metric = input_metric,
    output_domain = vector_domain(atom_domain(.T = f64, nan = FALSE)),
    output_metric = linf_distance(.T = f64, monotonic = TRUE),
    function_ = score_candidates,
    stability_map = stability
  )
}
then_grouping_cols_score <- to_then(make_grouping_cols_score)

#' Build a measurement that selects one set of grouping columns.
#'
#' Candidates are scored, noisy max with the given `scale` picks an index,
#' and the index is mapped back to the matching entry of `candidates`.
#'
#' @param input_domain Domain of the input data frame.
#' @param input_metric Metric on the input data frame.
#' @param candidates List of character vectors of column names.
#' @param min_bin_size Smallest group size that counts toward a score.
#' @param scale Noise scale passed to `then_noisy_max`.
#' @return A measurement that releases one element of `candidates`.
make_select_grouping_cols <- function(
  input_domain,
  input_metric,
  candidates,
  min_bin_size,
  scale
) {
  # The selected index is shifted by one before indexing the R list.
  pick_candidate <- function(idx) candidates[[idx + 1L]]

  input_space <- c(input_domain, input_metric)
  t_score <- then_grouping_cols_score(input_space, candidates, min_bin_size)
  m_index <- then_noisy_max(t_score, max_divergence(), scale = scale)
  then_postprocess(m_index, pick_candidate)
}
then_select_grouping_cols <- to_then(make_select_grouping_cols)

Next, use these functions to create a DP mechanism:

>>> row_count = 50
>>> col_count = 4
>>> private_data = pd.DataFrame(
...     {
...         **{
...             f"too_uniform_{n}": [
...                 random.randint(0, 1)
...                 for _ in range(row_count)
...             ]
...             for n in range(col_count)
...         },
...         **{
...             f"too_diverse_{n}": [
...                 random.randint(0, row_count)
...                 for _ in range(row_count)
...             ]
...             for n in range(col_count)
...         },
...         **{
...             f"just_right_{n}": [
...                 random.randint(0, 20)
...                 for _ in range(row_count)
...             ]
...             for n in range(col_count)
...         },
...     }
... )
>>> def powerset(iterable):
...     s = list(iterable)
...     return chain.from_iterable(
...         combinations(s, r) for r in range(1, len(s) + 1)
...     )
...
>>> candidates = list(powerset(private_data.columns))
>>> dp.enable_features("honest-but-curious", "contrib")
>>> m_select_gcols = make_select_grouping_cols(
...     candidates=candidates,
...     min_bin_size=89,
...     scale=10.0,
... )
>>> print("ε =", m_select_gcols.map(d_in=1))
ε = 0.1

row_count <- 50L
col_count <- 4L
# Three families of integer columns that differ only in how many distinct
# values they can take: 0..1, 0..row_count and 0..20.
private_data <- local({
  suffixes <- seq_len(col_count) - 1L

  # `col_count` columns named "<prefix>_<suffix>", sampled from 0..upper.
  draw_columns <- function(prefix, upper) {
    columns <- lapply(
      suffixes,
      function(i) sample(0L:upper, row_count, replace = TRUE)
    )
    setNames(columns, paste0(prefix, "_", suffixes))
  }

  data.frame(
    draw_columns("too_uniform", 1L),
    draw_columns("too_diverse", row_count),
    draw_columns("just_right", 20L),
    check.names = FALSE
  )
})

#' Enumerate every non-empty subset of `values`.
#'
#' @param values A vector of items, e.g. column names.
#' @return A list of vectors, one per subset, ordered by subset size. An
#'   empty list when `values` is empty.
powerset <- function(values) {
  n <- length(values)
  if (n == 0L) {
    return(list())
  }
  subsets <- lapply(seq_len(n), function(k) {
    # Combine positions rather than `values` itself: `combn(x, k)` expands a
    # single positive number `x` to `seq_len(x)`, which would enumerate the
    # wrong items for a length-one numeric input.
    combn(n, k, FUN = function(idx) values[idx], simplify = FALSE)
  })
  unlist(subsets, recursive = FALSE)
}

# Every non-empty combination of column names is a candidate grouping set.
candidates <- powerset(names(private_data))

# Input space: any data frame, compared under the symmetric distance.
input_space <- c(
  user_domain(identifier = "DataFrameDomain", member = is.data.frame),
  symmetric_distance()
)

# NOTE(review): row_count is 50, so no bin can hold 89 rows and every
# candidate scores 0 -- confirm the intended min_bin_size.
m_select_gcols <- then_select_grouping_cols(
  input_space,
  candidates = candidates,
  min_bin_size = 89L,
  scale = 10
)

# Privacy loss of the mechanism when datasets differ by one row.
m_select_gcols(d_in = 1L)

Finally, load your data and make a DP release:

>>> # Invoking the measurement on the private data makes the DP release.
>>> dp_selected_grouping_columns = m_select_gcols(private_data)
>>> dp_selected_grouping_columns  
(...)

# Invoking the measurement on the private data makes the DP release.
dp_selected_grouping_columns <- m_select_gcols(arg = private_data)
dp_selected_grouping_columns

Because the mechanism is randomized, successive runs may return different sets of columns; sets that score well against your criteria are more likely to be selected.