Source code for opendp.extras.numpy.canonical

"""
This module requires extra installs: ``pip install 'opendp[numpy]'``

For convenience, all the members of this module are also available from :py:mod:`opendp.prelude`.
We suggest importing under the conventional name ``dp``:

.. code:: pycon

    >>> import opendp.prelude as dp

The members of this module will then be accessible at ``dp.numpy.canonical``.
"""

from __future__ import annotations

import math
from typing import Literal
from opendp._lib import import_optional_dependency
from opendp.mod import binary_search



[docs]
class BinomialCND:
    """
    Utilities to conduct statistical inference on the output of the canonical noise mechanism on binomially-distributed data.

    Use :func:`opendp.measurements.make_canonical_noise` to instantiate the canonical noise mechanism.

    The mechanism outputs a sample from X + N,
    where X ~ Binomial(n=size, p=theta), N ~ CND(0, d_in, d_out).

    A counting query is a realization of X ~ Binomial(n=size, p=theta) when,
    for each row in data with ``size`` rows, a counter is incremented with probability theta.

    :param estimate: the differentially private outcome of the canonical noise mechanism on binomial data
    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) total number of records in the sensitive dataset
    """

    def __init__(
        self, estimate: float, d_in: float, d_out: tuple[float, float], size: int
    ) -> None:
        self.estimate = estimate
        self.d_in = d_in
        self.d_out = d_out
        self.size = size


[docs]
    def confidence_interval(
        self, alpha: float, side: Literal["lower", "upper"] | None = None
    ) -> float | tuple[float, float]:
        """Compute a confidence interval for ``self.estimate``.

        * side=None: return the two bounds of a confidence interval
        * side="lower": return a one-sided confidence interval lower bound
        * side="upper": return a one-sided confidence interval upper bound

        :param alpha: statistical significance level
        :param side: configure interval type
        :return: confidence interval bound(s)
        """
        if side is None:
            return two_sided_confidence_interval(
                self.estimate, self.d_in, self.d_out, self.size, alpha
            )

        if side in {"lower", "upper"}:
            return one_sided_confidence_interval(
                self.estimate, self.d_in, self.d_out, self.size, alpha, side
            )

        raise ValueError(f"tail must be None, 'lower' or 'upper', not '{side}'")  # pragma: no cover



[docs]
    def p_value(
        self, theta: float, tail: Literal["left", "right"] | None = None
    ) -> float:
        """
        Computes a p-value based on ``self.estimate``.

        * tail=None: null hypothesis states that true proportion is theta
        * tail="left": null hypothesis states that true proportion is no less than theta
        * tail="right": null hypothesis states that true proportion is no greater than theta

        :param theta: success rate
        :param tail: configure the null hypothesis
        :return: the probability of observing ``self.estimate``, given the null hypothesis
        """
        if tail is None:
            return two_sided_pvalue(
                self.estimate, self.d_in, self.d_out, self.size, theta
            )

        if tail in {"left", "right"}:
            return one_sided_pvalue(
                self.estimate, self.d_in, self.d_out, self.size, theta, tail
            )

        raise ValueError(f"tail must be None, 'left' or 'right', not '{tail}'")  # pragma: no cover


    def __repr__(self) -> str:
        return f"BinomialCND(estimate={self.estimate}, d_in={self.d_in}, d_out={self.d_out}, size={self.size})"



# If the canonical noise mechanism is generalized beyond (ε, δ)-DP, 
# then this needs to be updated, as it only considers the Tulap noise generated from this definition.
def _cnd_cdf(t, shift, d_in: float, d_out: tuple[float, float]):
    """Computes the value of the cumulative density function Pr[T <= t] where T ~ Tulap(shift, b, q),
    where b and q are calibrated to satisfy d_out-DP when sensitivity is d_in"""
    np = import_optional_dependency("numpy")
    epsilon, delta = d_out

    t = (np.atleast_1d(t) - shift) / d_in
    b = math.exp(-epsilon)
    q = (2 * delta * b) / (1 - b + 2 * delta * b)
    lcut = q / 2
    rcut = q / 2
    r = np.rint(t)
    g = -math.log(b)
    l = math.log(1 + b)  # noqa
    k = 1 - b
    
    with np.errstate(over="ignore"):
        negs = np.exp((r * g) - l + np.log(b + ((t - r + (1 / 2)) * k)))
        poss = 1 - np.exp((r * -g) - l + np.log(b + ((r - t + (1 / 2)) * k)))

    # check for infinities
    negs[np.isinf(negs)] = 0
    poss[np.isinf(poss)] = 0
    # truncate w.r.t. the indicator on t's positivity
    is_leq0 = np.less_equal(t, 0).astype(int)
    trunc = (is_leq0 * negs) + ((1 - is_leq0) * poss)

    # handle the cut adjustment and scaling
    q = lcut + rcut
    is_mid = np.logical_and(
        np.less_equal(lcut, trunc), np.less_equal(trunc, 1 - rcut)
    ).astype(int)
    is_rhs = np.less(1 - rcut, trunc).astype(int)
    return ((trunc - lcut) / (1 - q)) * is_mid + is_rhs



[docs]
def two_sided_confidence_interval(
    Z: float, d_in: float, d_out: tuple[float, float], size: int, alpha: float
) -> tuple[float, float]:
    """Compute a two-sided confidence interval centered on ``Z``.

    :param Z: realization of cnd random variable
    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) total number of records in the sensitive dataset
    :param alpha: statistical significance level
    :return: confidence interval bounds
    """
    mle = max(min(Z / size, 1.0), 0.0)
    predicate = lambda B: two_sided_pvalue(B * size, d_in, d_out, size, mle) > alpha

    L = binary_search(predicate, bounds=(0.0, mle)) if mle > 0 else 0.0
    U = binary_search(predicate, bounds=(mle, 1.0)) if mle < 1 else 1.0

    return float(L), float(U)




[docs]
def one_sided_confidence_interval(
    Z: float,
    d_in: float,
    d_out: tuple[float, float],
    size: int,
    alpha: float,
    side: Literal["lower", "upper"],
) -> float:
    """Compute a one-sided confidence interval for ``self.estimate``.

    * tail="lower": return a one-sided confidence interval lower bound
    * tail="upper": return a one-sided confidence interval upper bound

    :param Z: realization of cnd random variable
    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) total number of records in the sensitive dataset
    :param alpha: statistical significance level
    :param side: configure interval type
    :return: lower or upper confidence interval bound
    """
    mle = max(min(Z / size, 1.0), 0.0)
    tail = {"lower": "left", "upper": "right"}.get(side)
    if tail is None:
        raise ValueError(f"tail must be 'lower' or 'upper', not {tail}")  # pragma: no cover
    pred = lambda B: one_sided_pvalue(B * size, d_in, d_out, size, mle, tail) > alpha  # type: ignore[arg-type]
    return float(binary_search(pred, bounds=(0.0, 1.0)))




[docs]
def two_sided_pvalue(
    Z: float, d_in: float, d_out: tuple[float, float], size: int, theta: float
) -> float:
    """Computes a p-value for a hypothesis test on the probability of success.

    :param Z: realization of cnd random variable
    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) number of trials
    :param theta: true probability of binomial distribution
    :return: Probability of observing Z when the true success rate is theta
    """
    T = abs(size * theta - Z)
    left_tail = one_sided_pvalue(
        Z=size * theta + T,
        d_in=d_in,
        d_out=d_out,
        size=size,
        theta=theta,
        tail="right",
    )
    right_tail = one_sided_pvalue(
        Z=size * theta - T,
        d_in=d_in,
        d_out=d_out,
        size=size,
        theta=theta,
        tail="right",
    )

    return float(left_tail + (1 - right_tail))




[docs]
def one_sided_pvalue(
    Z: float,
    d_in: float,
    d_out: tuple[float, float],
    size: int,
    theta: float,
    tail: Literal["left", "right"],
) -> float:
    """Computes a p-value for a hypothesis test on the probability of success.

    * tail="left": null hypothesis states that true proportion is no less than theta
    * tail="right": null hypothesis states that true proportion is no greater than theta

    :param Z: realization of cnd random variable
    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) number of trials
    :param theta: true probability of binomial distribution
    :param tail: either "left" or "right"
    :return: Probability of observing Z when the true success rate is theta
    """
    stats = import_optional_dependency("scipy.stats")
    np = import_optional_dependency("numpy")

    values = np.arange(size)
    B = stats.binom.pmf(k=values, n=size, p=theta)
    if tail == "right":
        F = _cnd_cdf(values, Z, d_in=d_in, d_out=d_out)
    elif tail == "left":
        F = 1 - _cnd_cdf(values, Z, d_in=d_in, d_out=d_out)
    else:
        raise ValueError(f"tail must be 'left' or 'right', not '{tail}'")  # pragma: no cover
    return float(np.dot(F.T, B))




[docs]
def one_sided_uniformly_most_powerful_tests(
    d_in: float, 
    d_out: tuple[float, float],
    size: int,
    theta: float,
    alpha: float,
    tail: Literal["left", "right"],
) -> list[float]:
    """Compute one-sided UMP tests for each choice of i in [size].

    * When tail="left", then the null hypothesis is: the estimate is at most ``i``
    * When tail="right", then the null hypothesis is: the estimate is at least ``i``

    :param d_in: sensitivity of the input to the mechanism
    :param d_out: privacy parameters (ε, δ)
    :param size: (approximate) number of trials
    :param theta: true probability of binomial distribution
    :param alpha: statistical significance level
    :param tail: either "left" or "right"
    :return: The probability of accepting the null hypothesis for each choice of i in [size].
    """
    stats = import_optional_dependency("scipy.stats")
    np = import_optional_dependency("numpy")

    values = np.arange(0, size + 1)
    B = stats.binom.pmf(k=values, n=size, p=theta)

    predicate = lambda s: np.dot(B, _cnd_cdf(values, s, d_in, d_out)) > alpha

    root = binary_search(predicate)
    phi = _cnd_cdf(values, root, d_in, d_out)

    if tail == "left":
        return phi

    if tail == "right":
        return 1 - phi

    raise ValueError(f"tail must be either 'left' or 'right', not '{tail}'")  # pragma: no cover
Branches

Releases

Source code for opendp.extras.numpy.canonical