Questions or feedback?

Thiel-Sen Regression#

This example demonstrates how transformation and measurement plugins can be combined to build a differentially private Theil-Sen-style regression estimator.

Note

If you actually want to use Theil-Sen in Python, don’t copy-and-paste this code. It is already implemented by the OpenDP library: see the opendp.extras.sklearn.linear_model API reference.

The high-level flow is:

  1. Build a user-defined transformation that pairs records and predicts response values at two cut points.

  2. Build a user-defined measurement that privately estimates the medians of those predictions.

  3. Postprocess the private medians into slope and intercept coefficients.

Enable plugin-related features:

>>> import opendp.prelude as dp
>>> import numpy as np
>>> from pathlib import Path
>>> dp.enable_features("contrib", "honest-but-curious")

enable_features("contrib", "honest-but-curious")

1. Pairwise Prediction Transformation#

Construct a transformation that pairs rows and predicts response values at two x-axis cut points.

>>> def pairwise_predict(data, x_cuts):
...     data = np.array(data, copy=True)[: len(data) // 2 * 2]
...     np.random.shuffle(data)
...     p1, p2 = np.array_split(data, 2)
...     dx, dy = (p2 - p1).T
...     x_bar, y_bar = (p1 + p2).T / 2
...     points = dy / dx * (x_cuts[None].T - x_bar) + y_bar
...     return points.T[dx != 0]
...
>>> def make_pairwise_predict(x_cuts, runs: int = 1):
...     return dp.t.make_user_transformation(
...         input_domain=dp.numpy.array2_domain(
...             num_columns=2, T=float
...         ),
...         input_metric=dp.symmetric_distance(),
...         output_domain=dp.numpy.array2_domain(
...             num_columns=2, T=float
...         ),
...         output_metric=dp.symmetric_distance(),
...         function=lambda x: np.vstack(
...             [
...                 pairwise_predict(x, x_cuts)
...                 for _ in range(runs)
...             ]
...         ),
...         stability_map=lambda d_in: d_in * runs,
...     )
...

pairwise_predict <- function(data, x_cuts) {
  data <- as.matrix(data)
  data <- data[seq_len(nrow(data) %/% 2L * 2L), , drop = FALSE]
  data <- data[sample(nrow(data)), , drop = FALSE]

  midpoint <- nrow(data) %/% 2L
  p1 <- data[seq_len(midpoint), , drop = FALSE]
  p2 <- data[midpoint + seq_len(midpoint), , drop = FALSE]

  dx <- p2[, 1L] - p1[, 1L]
  dy <- p2[, 2L] - p1[, 2L]
  x_bar <- (p1[, 1L] + p2[, 1L]) / 2L
  y_bar <- (p1[, 2L] + p2[, 2L]) / 2L

  predicted_points <- sapply(
    x_cuts,
    function(x_cut) dy / dx * (x_cut - x_bar) + y_bar
  )
  predicted_points[dx != 0L, , drop = FALSE]
}

make_pairwise_predict <- function(
  input_domain,
  input_metric,
  x_cuts,
  runs = 1L
) {
  make_user_transformation(
    input_domain = input_domain,
    input_metric = input_metric,
    output_domain = input_domain,
    output_metric = input_metric,
    function_ = function(x) {
      as.data.frame(
        do.call(
          rbind,
          replicate(runs, pairwise_predict(x, x_cuts), simplify = FALSE)
        )
      )
    },
    stability_map = function(d_in) d_in * runs
  )
}
then_pairwise_predict <- to_then(make_pairwise_predict)

The figure below shows the intuition behind the pairwise prediction step: each sampled pair induces a line, and the mechanism records the predicted y-values where those lines cross two fixed x-axis cut points.

Pairwise fits used in the Theil-Sen plugin example.
Python code to generate this figure
>>> def render_pairwise_visualization():
...     import matplotlib.pyplot as plt
...     pairwise_plot_path = Path(__file__).with_name(
...         "theil-sen-pairwise-fits.png"
...     )
...     pairwise_plot_x_cuts = np.array([-1.5, 1.5])
...     pairwise_plot_x = np.linspace(-2.5, 2.5, 12)
...     pairwise_plot_y = 2 * pairwise_plot_x + 1
...     pairwise_plot_data = np.column_stack(
...         [pairwise_plot_x, pairwise_plot_y]
...     )
...     p1, p2 = np.array_split(pairwise_plot_data, 2)
...     fig, ax = plt.subplots(figsize=(7, 4.5))
...     ax.scatter(
...         pairwise_plot_data[:, 0],
...         pairwise_plot_data[:, 1],
...         color="black",
...         s=20,
...     )
...     for point_1, point_2 in zip(p1, p2):
...         slope = (point_2[1] - point_1[1]) / (
...             point_2[0] - point_1[0]
...         )
...         intercept = point_1[1] - slope * point_1[0]
...         ax.plot(
...             pairwise_plot_x,
...             slope * pairwise_plot_x + intercept,
...             color="#7a9e9f",
...             alpha=0.45,
...         )
...         predicted = slope * pairwise_plot_x_cuts + intercept
...         ax.scatter(
...             pairwise_plot_x_cuts,
...             predicted,
...             color="#bc4b51",
...             s=28,
...             zorder=3,
...         )
...     for x_cut in pairwise_plot_x_cuts:
...         ax.axvline(
...             x_cut,
...             color="#2c4c7c",
...             linestyle="--",
...             linewidth=1,
...         )
...     ax.set_title("Pairwise Predictions at Fixed Cut Points")
...     ax.set_xlabel("x")
...     ax.set_ylabel("y")
...     fig.tight_layout()
...     fig.savefig(pairwise_plot_path, dpi=150)
...     plt.close(fig)
...

2. Private Median Subroutine#

Construct a measurement that privately estimates the median prediction at each cut point.

>>> def make_select_column(j):
...     return dp.t.make_user_transformation(
...         input_domain=dp.numpy.array2_domain(
...             num_columns=2, T=float
...         ),
...         input_metric=dp.symmetric_distance(),
...         output_domain=dp.vector_domain(
...             dp.atom_domain(T=float)
...         ),
...         output_metric=dp.symmetric_distance(),
...         function=lambda x: x[:, j],
...         stability_map=lambda d_in: d_in,
...     )
...
>>> def make_private_percentile_medians(
...     output_measure, y_bounds, scale
... ):
...     m_median = dp.m.then_private_quantile(
...         output_measure,
...         candidates=np.linspace(*y_bounds, 100),
...         alpha=0.5,
...         scale=scale,
...     )
...     return dp.c.make_composition(
...         [
...             make_select_column(0)
...             >> dp.t.then_drop_null()
...             >> m_median,
...             make_select_column(1)
...             >> dp.t.then_drop_null()
...             >> m_median,
...         ]
...     )
...

make_select_column <- function(input_domain, input_metric, j) {
  make_user_transformation(
    input_domain = input_domain,
    input_metric = input_metric,
    output_domain = vector_domain(atom_domain(.T = f64)),
    output_metric = symmetric_distance(),
    function_ = function(x) x[, j],
    stability_map = function(d_in) d_in
  )
}
then_select_column <- to_then(make_select_column)

make_private_medians <- function(
  input_domain,
  input_metric,
  output_measure,
  y_bounds,
  scale
) {
  candidates <- seq(y_bounds[[1L]], y_bounds[[2L]], length.out = 100L)
  column_space <- c(input_domain, input_metric)
  first_median <- column_space |>
    then_select_column(1L) |>
    then_drop_null() |>
    then_private_quantile(
      output_measure = output_measure,
      candidates = candidates,
      alpha = 0.5,
      scale = scale
    )
  second_median <- column_space |>
    then_select_column(2L) |>
    then_drop_null() |>
    then_private_quantile(
      output_measure = output_measure,
      candidates = candidates,
      alpha = 0.5,
      scale = scale
    )

  make_composition(c(first_median, second_median))
}
then_private_medians <- to_then(make_private_medians)

3. Assemble the Mechanism#

Combine the transformation, private medians, and postprocessing into a single measurement.

>>> def make_private_theil_sen(
...     output_measure, x_bounds, y_bounds, scale, runs=1
... ):
...     x_cuts = x_bounds[0] + (
...         x_bounds[1] - x_bounds[0]
...     ) * np.array([0.25, 0.75])
...     p_inv = np.linalg.inv(
...         np.vstack([x_cuts, np.ones_like(x_cuts)]).T
...     )
...     return (
...         make_pairwise_predict(x_cuts, runs)
...         >> make_private_percentile_medians(
...             output_measure, y_bounds, scale
...         )
...         >> (lambda ys: p_inv @ ys)
...     )
...
>>> x_bounds = (-3.0, 3.0)
>>> y_bounds = (-10.0, 10.0)
>>> meas = make_private_theil_sen(
...     dp.max_divergence(), x_bounds, y_bounds, scale=1.0
... )
>>> meas.map(1)
2.0

make_private_theil_sen <- function(
  input_domain,
  input_metric,
  output_measure,
  x_bounds,
  y_bounds,
  scale,
  runs = 1L
) {
  x_cuts <- x_bounds[[1L]] + (x_bounds[[2L]] - x_bounds[[1L]]) * c(0.25, 0.75)
  p_inv <- solve(cbind(x_cuts, 1L))

  c(input_domain, input_metric) |>
    then_pairwise_predict(x_cuts, runs) |>
    then_private_medians(output_measure, y_bounds, scale) |>
    then_postprocess(
      function(ys) as.vector(p_inv %*% as.numeric(unlist(ys))),
      .TO = "Vec<f64>"
    )
}
then_private_theil_sen <- to_then(make_private_theil_sen)

x_bounds <- c(-3.0, 3.0)
y_bounds <- c(-10.0, 10.0)
input_space <- c(
  user_domain(
    identifier = "DataFrame2Domain",
    member = function(x) is.data.frame(x) && ncol(x) == 2L
  ),
  symmetric_distance()
)
meas <- input_space |>
  then_private_theil_sen(max_divergence(), x_bounds, y_bounds, scale = 1.)
meas(d_in = 1L)

4. Run a Private Release#

Apply the measurement to synthetic data to obtain private regression coefficients.

>>> np.random.seed(1)
>>> def f(x):
...     return x * 2 + 1
...
>>> x = np.random.normal(size=100, loc=0, scale=1.0)
>>> y = f(x) + np.random.normal(size=100, loc=0, scale=0.5)
>>> slope, intercept = meas(np.stack([x, y], axis=1))
>>> round(float(slope), 6), round(
...     float(intercept), 6
... )  
(..., ...)

f <- function(x) x * 2L + 1L

x <- rnorm(100L)
y <- f(x) + rnorm(100L, sd = 0.5)
private_data <- data.frame(x = x, y = y)
meas(arg = private_data)

The original notebook also visualized the resulting private fit:

Private Theil-Sen fit from the plugin example.
Python code to generate this figure
>>> def render_private_fit_visualization():
...     import matplotlib.pyplot as plt
...     private_fit_plot_path = Path(__file__).with_name(
...         "theil-sen-private-fit.png"
...     )
...     fig, ax = plt.subplots(figsize=(7, 4.5))
...     ax.scatter(
...         x,
...         y,
...         color="black",
...         s=14,
...         alpha=0.7,
...         label="synthetic data",
...     )
...     plot_x = np.linspace(x.min(), x.max(), 200)
...     ax.plot(
...         plot_x,
...         f(plot_x),
...         color="#2c4c7c",
...         linewidth=2,
...         label="true fit",
...     )
...     ax.plot(
...         plot_x,
...         slope * plot_x + intercept,
...         color="#bc4b51",
...         linewidth=2,
...         label="private fit",
...     )
...     ax.set_title("Private Theil-Sen Regression Fit")
...     ax.set_xlabel("x")
...     ax.set_ylabel("y")
...     ax.legend()
...     fig.tight_layout()
...     fig.savefig(private_fit_plot_path, dpi=150)
...     plt.close(fig)
...