Source code for pymc_experimental.distributions.histogram_utils

#   Copyright 2022 The PyMC Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.


from typing import Dict

import numpy as np
import pymc as pm
from numpy.typing import ArrayLike

__all__ = ["quantile_histogram", "discrete_histogram", "histogram_approximation"]


def quantile_histogram(
    data: ArrayLike, n_quantiles=1000, zero_inflation=False
) -> Dict[str, ArrayLike]:
    try:
        import xhistogram.core
    except ImportError as e:
        raise RuntimeError("quantile_histogram requires xhistogram package") from e
    try:
        import dask.array
        import dask.dataframe
    except ImportError:
        dask = None
    if dask and isinstance(data, (dask.dataframe.Series, dask.dataframe.DataFrame)):
        data = data.to_dask_array(lengths=True)
    if zero_inflation:
        zeros = (data == 0).sum(0)
        mdata = np.ma.masked_where(data == 0, data)
        qdata = data[data > 0]
    else:
        mdata = data
        qdata = data.flatten()
    quantiles = np.percentile(qdata, np.linspace(0, 100, n_quantiles))
    if dask:
        (quantiles,) = dask.compute(quantiles)
    count, _ = xhistogram.core.histogram(mdata, bins=[quantiles], axis=0)
    count = count.transpose(count.ndim - 1, *range(count.ndim - 1))
    lower = quantiles[:-1]
    upper = quantiles[1:]

    if zero_inflation:
        count = np.concatenate([zeros[None], count])
        lower = np.concatenate([[0], lower])
        upper = np.concatenate([[0], upper])
    lower = lower.reshape(lower.shape + (1,) * (count.ndim - 1))
    upper = upper.reshape(upper.shape + (1,) * (count.ndim - 1))

    result = dict(
        lower=lower,
        upper=upper,
        mid=(lower + upper) / 2,
        count=count,
    )
    return result


def discrete_histogram(data: ArrayLike, min_count=None) -> Dict[str, ArrayLike]:
    try:
        import xhistogram.core
    except ImportError as e:
        raise RuntimeError("discrete_histogram requires xhistogram package") from e
    try:
        import dask.array
        import dask.dataframe
    except ImportError:
        dask = None

    if dask and isinstance(data, (dask.dataframe.Series, dask.dataframe.DataFrame)):
        data = data.to_dask_array(lengths=True)
    mid, count_uniq = np.unique(data, return_counts=True)
    if min_count is not None:
        mid = mid[count_uniq >= min_count]
        count_uniq = count_uniq[count_uniq >= min_count]
    bins = np.concatenate([mid, [mid.max() + 1]])
    if dask:
        mid, bins = dask.compute(mid, bins)
    count, _ = xhistogram.core.histogram(data, bins=[bins], axis=0)
    count = count.transpose(count.ndim - 1, *range(count.ndim - 1))
    mid = mid.reshape(mid.shape + (1,) * (count.ndim - 1))
    return dict(mid=mid, count=count)



[docs]
def histogram_approximation(name, dist, *, observed, **h_kwargs):
    """Approximate a distribution with a histogram potential.

    Parameters
    ----------
    name : str
        Name for the Potential
    dist : TensorVariable
        The output of pm.Distribution.dist()
    observed : ArrayLike
        Observed value to construct a histogram. Histogram is computed over 0th axis.
        Dask is supported.

    Returns
    -------
    TensorVariable
        Potential

    Examples
    --------
    Discrete variables are reduced to unique repetitions (up to min_count)

    >>> import pymc as pm
    >>> import pymc_experimental as pmx
    >>> production = np.random.poisson([1, 2, 5], size=(1000, 3))
    >>> with pm.Model(coords=dict(plant=range(3))):
    ...     lam = pm.Exponential("lam", 1.0, dims="plant")
    ...     pot = pmx.distributions.histogram_approximation(
    ...         "pot", pm.Poisson.dist(lam), observed=production, min_count=2
    ...     )

    Continuous variables are discretized into n_quantiles

    >>> measurements = np.random.normal([1, 2, 3], [0.1, 0.4, 0.2], size=(10000, 3))
    >>> with pm.Model(coords=dict(tests=range(3))):
    ...     m = pm.Normal("m", dims="tests")
    ...     s = pm.LogNormal("s", dims="tests")
    ...     pot = pmx.distributions.histogram_approximation(
    ...         "pot", pm.Normal.dist(m, s),
    ...         observed=measurements, n_quantiles=50
    ...     )

    For special cases like Zero Inflation in Continuous variables there is a flag.
    The flag adds a separate bin for zeros

    >>> measurements = abs(measurements)
    >>> measurements[100:] = 0
    >>> with pm.Model(coords=dict(tests=range(3))):
    ...     m = pm.Normal("m", dims="tests")
    ...     s = pm.LogNormal("s", dims="tests")
    ...     pot = pmx.distributions.histogram_approximation(
    ...         "pot", pm.Normal.dist(m, s),
    ...         observed=measurements, n_quantiles=50, zero_inflation=True
    ...     )
    """
    try:
        import dask.array
        import dask.dataframe
    except ImportError:
        dask = None
    if dask and isinstance(observed, (dask.dataframe.Series, dask.dataframe.DataFrame)):
        observed = observed.to_dask_array(lengths=True)
    if np.issubdtype(observed.dtype, np.integer):
        histogram = discrete_histogram(observed, **h_kwargs)
    else:
        histogram = quantile_histogram(observed, **h_kwargs)
    if dask is not None:
        (histogram,) = dask.compute(histogram)
    return pm.Potential(name, pm.logp(dist, histogram["mid"]) * histogram["count"])